Description
This model maps extracted anatomical-structure entities to SNOMED codes (body structure version) using `sbiobert_base_cased_mli` BERT sentence embeddings.
How to use
# Example pipeline: extract body-part chunks with three NER models, merge them,
# embed each chunk and resolve it with the SNOMED body-structure resolver.
# Assumes an active Spark session named `spark` and that the Spark NLP /
# Healthcare NLP annotator classes used below are already imported.

# Turn the raw "text" column into document annotations.
document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Word embeddings shared by all three NER models below.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# NER model 1: ner_jsl, restricted to its two body-part labels, both renamed to "BodyPart".
ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_jsl")

ner_jsl_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner_jsl"])\
    .setOutputCol("ner_jsl_chunk")\
    .setWhiteList(["External_body_part_or_region",
                   "Internal_organ_or_component"])\
    .setReplaceLabels({"External_body_part_or_region": "BodyPart",
                       "Internal_organ_or_component": "BodyPart" })

# NER model 2: ner_anatomy_coarse, with "Anatomy" renamed to "BodyPart".
ner_anatomy = MedicalNerModel.pretrained("ner_anatomy_coarse", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_anatomy")

ner_anatomy_converter = NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_anatomy"]) \
    .setOutputCol("ner_anatomy_chunk")\
    .setReplaceLabels({"Anatomy": "BodyPart"})

# NER model 3: ner_oncology_anatomy_general, with "Anatomical_Site" renamed to "BodyPart".
ner_oncology_anatomy = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_oncology_anatomy")

ner_oncology_anatomy_converter = NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_oncology_anatomy"]) \
    .setOutputCol("ner_oncology_anatomy_chunk")\
    .setReplaceLabels({"Anatomical_Site": "BodyPart"})

# Merge the three chunk columns into a single "ner_chunk" column.
# (The original ended this statement with a stray line continuation, which
# glued the next assignment onto it and produced a SyntaxError.)
chunk_merger = ChunkMergeApproach() \
    .setInputCols("ner_jsl_chunk", "ner_anatomy_chunk", "ner_oncology_anatomy_chunk") \
    .setOutputCol("ner_chunk")

# Convert chunks to documents so they can be fed to the sentence-embedding model.
chunk2doc = Chunk2Doc()\
    .setInputCols("ner_chunk")\
    .setOutputCol("ner_chunk_doc")

sbert_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\
    .setInputCols(["ner_chunk_doc"])\
    .setOutputCol("sbert_embeddings")\
    .setCaseSensitive(False)

# Resolver: consumes the chunk embeddings and writes the "snomed_code" column.
snomed_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_bodyStructure", "en", "clinical/models") \
    .setInputCols(["sbert_embeddings"]) \
    .setOutputCol("snomed_code")

snomed_pipeline = Pipeline(stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_jsl,
    ner_jsl_converter,
    ner_anatomy,
    ner_anatomy_converter,
    ner_oncology_anatomy,
    ner_oncology_anatomy_converter,
    chunk_merger,
    chunk2doc,
    sbert_embeddings,
    snomed_resolver
])

sample_text = """The patient is a 30-year-old female with a long history of insulin-dependent diabetes, type 2; coronary artery disease; chronic renal insufficiency; peripheral vascular disease, also secondary to diabetes; who was originally admitted to an outside hospital for what appeared to be acute paraplegia, lower extremities. She did receive a course of Bactrim for 14 days for UTI."""

df= spark.createDataFrame([[sample_text]]).toDF("text")

# Fit and run the pipeline defined above (it is named `snomed_pipeline` in this snippet).
result= snomed_pipeline.fit(df).transform(df)
# Same SNOMED body-structure resolution pipeline, written against the `nlp` /
# `medical` namespaces. Assumes `nlp`, `medical` and an active `spark` session
# are already available in the calling environment.

# --- Text preparation -------------------------------------------------------
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

word_embeddings = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# --- NER 1: ner_jsl, keeping only its two body-part labels -------------------
ner_jsl = (
    medical.NerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_jsl")
)

ner_jsl_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_jsl"])
    .setOutputCol("ner_jsl_chunk")
    .setWhiteList(["External_body_part_or_region",
                   "Internal_organ_or_component"])
    .setReplaceLabels({"External_body_part_or_region": "BodyPart",
                       "Internal_organ_or_component": "BodyPart" })
)

# --- NER 2: ner_anatomy_coarse ----------------------------------------------
ner_anatomy = (
    medical.NerModel.pretrained("ner_anatomy_coarse", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_anatomy")
)

ner_anatomy_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_anatomy"])
    .setOutputCol("ner_anatomy_chunk")
    .setReplaceLabels({"Anatomy": "BodyPart"})
)

# --- NER 3: ner_oncology_anatomy_general ------------------------------------
ner_oncology_anatomy = (
    medical.NerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_oncology_anatomy")
)

ner_oncology_anatomy_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_oncology_anatomy"])
    .setOutputCol("ner_oncology_anatomy_chunk")
    .setReplaceLabels({"Anatomical_Site": "BodyPart"})
)

# --- Merge chunks, embed them, resolve to SNOMED ----------------------------
chunk_merger = (
    medical.ChunkMergeApproach()
    .setInputCols("ner_jsl_chunk", "ner_anatomy_chunk", "ner_oncology_anatomy_chunk")
    .setOutputCol("ner_chunk")
)

chunk2doc = (
    nlp.Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

snomed_resolver = (
    medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_bodyStructure", "en", "clinical/models")
    .setInputCols(["sbert_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Stages are listed in execution order; each one reads columns produced earlier.
nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        ner_jsl,
        ner_jsl_converter,
        ner_anatomy,
        ner_anatomy_converter,
        ner_oncology_anatomy,
        ner_oncology_anatomy_converter,
        chunk_merger,
        chunk2doc,
        sbert_embedder,
        snomed_resolver,
    ]
)

sample_text = """The patient is a 30-year-old female with a long history of insulin-dependent diabetes, type 2; coronary artery disease; chronic renal insufficiency; peripheral vascular disease, also secondary to diabetes; who was originally admitted to an outside hospital for what appeared to be acute paraplegia, lower extremities. She did receive a course of Bactrim for 14 days for UTI."""

# Single-row DataFrame with the sample note in a "text" column.
df = spark.createDataFrame([[sample_text]]).toDF("text")

fitted_pipeline = nlpPipeline.fit(df)
result = fitted_pipeline.transform(df)
// Scala version of the SNOMED body-structure resolution pipeline.
// Assumes an active Spark session and that the Spark NLP / Healthcare NLP
// annotator classes used below are already imported.

val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Word embeddings shared by all three NER models below.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence","token"))
  .setOutputCol("embeddings")

// NER model 1: ner_jsl, restricted to its two body-part labels.
val ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
  .setInputCols(Array("sentence","token","embeddings"))
  .setOutputCol("ner")

// NerConverterInternal is used because the label replacement below is not
// offered by the plain NerConverter. The replacement map targets the two
// whitelisted labels (the original mapped a label this model is not
// whitelisted for, using Python dict syntax).
val ner_jsl_converter = new NerConverterInternal()
  .setInputCols(Array("sentence","token","ner"))
  .setOutputCol("ner_jsl_chunk")
  .setWhiteList(Array("External_body_part_or_region", "Internal_organ_or_component"))
  .setReplaceLabels(Map(
    "External_body_part_or_region" -> "BodyPart",
    "Internal_organ_or_component" -> "BodyPart"
  ))

// NER model 2: ner_anatomy_coarse, with "Anatomy" renamed to "BodyPart".
val ner_anatomy = MedicalNerModel.pretrained("ner_anatomy_coarse", "en", "clinical/models")
  .setInputCols(Array("sentence","token","embeddings"))
  .setOutputCol("ner_anatomy")

val ner_anatomy_converter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner_anatomy"))
  .setOutputCol("ner_anatomy_chunk")
  .setReplaceLabels(Map("Anatomy" -> "BodyPart"))

// NER model 3: ner_oncology_anatomy_general, keeping only "Anatomical_Site".
val ner_oncology_anatomy = MedicalNerModel.pretrained("ner_oncology_anatomy_general", "en", "clinical/models")
  .setInputCols(Array("sentence","token","embeddings"))
  .setOutputCol("ner_oncology_anatomy")

val ner_oncology_anatomy_converter = new NerConverterInternal()
  .setInputCols(Array("sentence","token","ner_oncology_anatomy"))
  .setOutputCol("ner_oncology_anatomy_chunk")
  .setWhiteList(Array("Anatomical_Site"))
  .setReplaceLabels(Map("Anatomical_Site" -> "BodyPart"))

// Merge the three chunk columns into a single "ner_chunk" column.
val chunk_merger = new ChunkMergeApproach()
  .setInputCols("ner_jsl_chunk", "ner_anatomy_chunk", "ner_oncology_anatomy_chunk")
  .setOutputCol("ner_chunk")

// Convert chunks to documents so they can be fed to the sentence-embedding model.
val chunk2doc = new Chunk2Doc()
  .setInputCols("ner_chunk")
  .setOutputCol("ner_chunk_doc")

val sbert_embedder = BertSentenceEmbeddings
  .pretrained("sbiobert_base_cased_mli","en","clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sbert_embeddings")
  .setCaseSensitive(false)

// The resolver reads only the sentence embeddings and writes "snomed_code",
// matching the Python examples and the Output Labels in the model information.
val resolver = SentenceEntityResolverModel
  .pretrained("sbiobertresolve_snomed_bodyStructure", "en", "clinical/models")
  .setInputCols(Array("sbert_embeddings"))
  .setOutputCol("snomed_code")
  .setDistanceFunction("EUCLIDEAN")

// Stages are listed in execution order and refer to the vals declared above.
val nlpPipeline = new Pipeline().setStages(Array(
  document_assembler,
  sentenceDetectorDL,
  tokenizer,
  word_embeddings,
  ner_jsl,
  ner_jsl_converter,
  ner_anatomy,
  ner_anatomy_converter,
  ner_oncology_anatomy,
  ner_oncology_anatomy_converter,
  chunk_merger,
  chunk2doc,
  sbert_embedder,
  resolver
))

val sample_text = """The patient is a 30-year-old female with a long history of insulin-dependent diabetes, type 2; coronary artery disease; chronic renal insufficiency; peripheral vascular disease, also secondary to diabetes; who was originally admitted to an outside hospital for what appeared to be acute paraplegia, lower extremities. She did receive a course of Bactrim for 14 days for UTI."""

// NOTE: `toDF` on a Seq needs `spark.implicits._` in scope (spark-shell imports it by default).
val df= Seq(sample_text).toDF("text")

val result= nlpPipeline.fit(df).transform(df)
Results
| ner_chunk | entity | snomed_code | resolutions | all_codes | all_resolutions |
|---------------------|----------|-------------|----------------------------|------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
| coronary artery | BodyPart | 181294004 | coronary artery | [181294004, 119204004, 360487004, 55537005, 41801008, 110554000, 312553002, 48955001, 1343313006...] | [coronary artery, coronary artery part, segment of coronary artery, ostium of coronary artery, ...] |
| renal | BodyPart | 64033007 | renal structure | [64033007, 243968009, 84924000, 303402001, 361332007, 181339005, 50403003, 181414000, 91773002, ...] | [renal structure, renal area, renal segment, renal vessels, renal tubule, renal artery, ...] |
| peripheral vascular | BodyPart | 51833009 | peripheral vascular system | [51833009, 840581000, 3058005, 300054001, 281828002, 70402007, 84782009, 63111001, 244457007, ...] | [peripheral vascular system, peripheral artery, peripheral nervous system, peripheral nerve of ...] |
| lower extremities | BodyPart | 61685007 | lower extremity | [61685007, 127951001, 120575009, 182281004, 276744008, 244476009, 63337009, 302584006, 128263001...] | [lower extremity, lower extremity region, lower extremity part, lower limb, lower limb compartment...] |
Model Information
| Model Name: | sbiobertresolve_snomed_bodyStructure |
| Compatibility: | Healthcare NLP 6.3.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [sentence_embeddings] |
| Output Labels: | [snomed_code] |
| Language: | en |
| Size: | 210.1 MB |
| Case sensitive: | false |