Description
This is a Sentence Entity Resolver model that maps clinical entities to SNOMED codes using bge_base_en_v1_5_onnx embeddings. It leverages contextual embeddings to improve code resolution accuracy for medical concepts, including diseases, symptoms, procedures, and drugs.
Predicted Entities
SNOMED Codes
How to use
# Stage 1: wrap the raw "text" column into Spark NLP documents.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences with the pretrained healthcare model.
sentence_detector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenize every sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: clinical word embeddings consumed by the NER model.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: tag tokens with the pretrained "ner_jsl" clinical NER model.
ner_jsl = (
    MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_jsl")
)

# Entity labels that are allowed through to the resolver.
ner_whitelist = [
    "Procedure", "Kidney_Disease", "Cerebrovascular_Disease", "Heart_Disease",
    "Disease_Syndrome_Disorder", "ImagingFindings", "Symptom", "VS_Finding",
    "EKG_Findings", "Communicable_Disease", "Substance", "Drug_Ingredient",
    "Internal_organ_or_component", "External_body_part_or_region", "Modifier",
    "Triglycerides", "Alcohol", "Smoking", "Hypertension", "Obesity",
    "Injury_or_Poisoning", "Test", "Hyperlipidemia", "BMI", "Oncological",
    "Psychological_Condition", "LDL", "Diabetes",
]

# Stage 6: merge NER tags into chunks, restricted to the whitelisted labels.
ner_jsl_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_jsl"])
    .setOutputCol("ner_jsl_chunk")
    .setWhiteList(ner_whitelist)
)

# Stage 7: re-expose each chunk as a document so it can be embedded on its own.
chunk2doc = (
    Chunk2Doc()
    .setInputCols("ner_jsl_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Stage 8: BGE embeddings of each chunk document.
bge_embeddings = (
    BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("bge_embeddings")
)

# Stage 9: resolve each embedded chunk to a SNOMED code using Euclidean distance.
snomed_resolver = (
    SentenceEntityResolverModel.pretrained("bgeresolve_snomed", "en", "clinical/models")
    .setInputCols(["bge_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Assemble the stages in execution order.
pipeline = Pipeline().setStages([
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_jsl,
    ner_jsl_converter,
    chunk2doc,
    bge_embeddings,
    snomed_resolver,
])

# Sample clinical sentence used to demonstrate the resolver.
text = """John's doctor prescribed ofloxacin for his secondary conjunctivitis, cefixime for his cystic urethritis, ibuprofen for his inflammation, and cilnidipine for his hypertension on 2023-12-01."""

# Single-row DataFrame with a "text" column, then fit and apply the pipeline.
data = spark.createDataFrame([[text]]).toDF("text")
result = pipeline.fit(data).transform(data)
# Same pipeline as above, written against the johnsnowlabs `nlp` / `medical` namespaces.

# Wrap the raw "text" column into Spark NLP documents.
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences with the pretrained healthcare model.
sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# Tokenize every sentence.
tokenizer = nlp.Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Clinical word embeddings consumed by the NER model.
word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# Tag tokens with the pretrained "ner_jsl" clinical NER model.
ner_jsl = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_jsl")

# Merge NER tags into chunks, restricted to the whitelisted entity labels.
# Labels must match the NER output exactly: a stray leading space (as in the
# former " Diabetes" entry) would never match and would drop Diabetes chunks.
ner_jsl_converter = medical.NerConverter()\
    .setInputCols(["sentence", "token", "ner_jsl"])\
    .setOutputCol("ner_jsl_chunk")\
    .setWhiteList(["Procedure","Kidney_Disease","Cerebrovascular_Disease","Heart_Disease",
                   "Disease_Syndrome_Disorder", "ImagingFindings", "Symptom", "VS_Finding",
                   "EKG_Findings", "Communicable_Disease","Substance","Drug_Ingredient",
                   "Internal_organ_or_component", "External_body_part_or_region", "Modifier",
                   "Triglycerides", "Alcohol", "Smoking", "Hypertension", "Obesity",
                   "Injury_or_Poisoning","Test","Hyperlipidemia","BMI","Oncological",
                   "Psychological_Condition", "LDL", "Diabetes"])

# Re-expose each chunk as a document so it can be embedded on its own.
chunk2doc = nlp.Chunk2Doc()\
    .setInputCols("ner_jsl_chunk")\
    .setOutputCol("ner_chunk_doc")

# BGE embeddings of each chunk document.
bge_embeddings = nlp.BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")\
    .setInputCols(["ner_chunk_doc"])\
    .setOutputCol("bge_embeddings")

# Resolve each embedded chunk to a SNOMED code using Euclidean distance.
snomed_resolver = medical.SentenceEntityResolverModel.pretrained("bgeresolve_snomed", "en", "clinical/models")\
    .setInputCols(["bge_embeddings"])\
    .setOutputCol("snomed_code")\
    .setDistanceFunction("EUCLIDEAN")

# Assemble the stages in execution order.
pipeline = nlp.Pipeline().setStages([
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_jsl,
    ner_jsl_converter,
    chunk2doc,
    bge_embeddings,
    snomed_resolver
])

# Sample clinical sentence used to demonstrate the resolver.
text = """John's doctor prescribed ofloxacin for his secondary conjunctivitis, cefixime for his cystic urethritis, ibuprofen for his inflammation, and cilnidipine for his hypertension on 2023-12-01."""

# Single-row DataFrame with a "text" column, then fit and apply the pipeline.
data = spark.createDataFrame([[text]]).toDF("text")
result = pipeline.fit(data).transform(data)
// Stage 1: wrap the raw "text" column into Spark NLP documents.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: split each document into sentences with the pretrained healthcare model.
val sentenceDetector = SentenceDetectorDLModel
  .pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Stage 3: tokenize every sentence.
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Stage 4: clinical word embeddings consumed by the NER model.
val wordEmbeddings = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("embeddings")

// Stage 5: tag tokens with the pretrained "ner_jsl" clinical NER model.
val nerJsl = MedicalNerModel
  .pretrained("ner_jsl", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "embeddings"))
  .setOutputCol("ner_jsl")

// Entity labels that are allowed through to the resolver.
val entityWhiteList = Array(
  "Procedure", "Kidney_Disease", "Cerebrovascular_Disease", "Heart_Disease",
  "Disease_Syndrome_Disorder", "ImagingFindings", "Symptom", "VS_Finding",
  "EKG_Findings", "Communicable_Disease", "Substance", "Drug_Ingredient",
  "Internal_organ_or_component", "External_body_part_or_region", "Modifier",
  "Triglycerides", "Alcohol", "Smoking", "Hypertension", "Obesity",
  "Injury_or_Poisoning", "Test", "Hyperlipidemia", "BMI", "Oncological",
  "Psychological_Condition", "LDL", "Diabetes"
)

// Stage 6: merge NER tags into chunks, restricted to the whitelisted labels.
val nerJslConverter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner_jsl"))
  .setOutputCol("ner_jsl_chunk")
  .setWhiteList(entityWhiteList)

// Stage 7: re-expose each chunk as a document so it can be embedded on its own.
val chunk2doc = new Chunk2Doc()
  .setInputCols("ner_jsl_chunk")
  .setOutputCol("ner_chunk_doc")

// Stage 8: BGE embeddings of each chunk document.
val bgeEmbeddings = BGEEmbeddings
  .pretrained("bge_base_en_v1_5_onnx", "en")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("bge_embeddings")

// Stage 9: resolve each embedded chunk to a SNOMED code using Euclidean distance.
val snomedResolver = SentenceEntityResolverModel
  .pretrained("bgeresolve_snomed", "en", "clinical/models")
  .setInputCols(Array("bge_embeddings"))
  .setOutputCol("snomed_code")
  .setDistanceFunction("EUCLIDEAN")

// Assemble the stages in execution order.
val pipeline = new Pipeline().setStages(
  Array(
    documentAssembler,
    sentenceDetector,
    tokenizer,
    wordEmbeddings,
    nerJsl,
    nerJslConverter,
    chunk2doc,
    bgeEmbeddings,
    snomedResolver
  )
)

// Sample clinical sentence used to demonstrate the resolver.
val text = """John's doctor prescribed ofloxacin for his secondary conjunctivitis, cefixime for his cystic urethritis, ibuprofen for his inflammation, and cilnidipine for his hypertension on 2023-12-01."""

// Single-row DataFrame with a "text" column, then fit and apply the pipeline.
val data = Seq(text).toDF("text")
val result = pipeline.fit(data).transform(data)
Results
|sent_id|ner_chunk |entity |snomed_code|resolutions |all_codes |all_resolutions |
|-------|-----------------|-------------------------|-----------|------------------|-----------------------------------------------|-----------------------------------------------|
|0 |ofloxacin |Drug_Ingredient |387551000 |ofloxacin |[387551000, 96086002, 392417006, 392415003, ...]|[ofloxacin, ofloxacin product, parenteral oflo...]|
|0 |secondary |Modifier |2603003 |secondary |[2603003, 262134003, 721071000000106, ...] |[secondary, secondary procedure, secondary and...]|
|0 |conjunctivitis |Disease_Syndrome_Disorder|9826008 |conjunctivitis |[9826008, 473460002, 45261009, 128350005, ...] |[conjunctivitis, allergic conjunctivitis, vira...]|
|0 |cefixime |Drug_Ingredient |387536009 |cefixime |[387536009, 96052006, 713750001, 294548002, ...]|[cefixime, cefixime product, oral form cefixim...]|
|0 |cystic urethritis|Disease_Syndrome_Disorder|1259233009 |cystic urethritis |[1259233009, 70795003, 31822004, 429728004, ...]|[cystic urethritis, urethral cyst, urethritis,...]|
|0 |ibuprofen |Drug_Ingredient |387207008 |ibuprofen |[387207008, 350321003, 293619005, 38268001, ...]|[ibuprofen, oral ibuprofen, ibuprofen allergy,...]|
|0 |inflammation |Symptom |257552002 |inflammation |[257552002, 225540005, 274144001, 3723001, ...] |[inflammation, wound inflammation, inflammatio...]|
|0 |cilnidipine |Drug_Ingredient |1177123004 |cilnidipine |[1177123004, 1179035008, 1179037000, ...] |[cilnidipine, cilnidipine-containing product, ...]|
|0 |hypertension |Hypertension |38341003 |hypertension |[38341003, 75367002, 73578008, 28119000, ...] |[hypertension, blood pressure, hyperdistension...]|
Model Information
| Model Name: | bgeresolve_snomed |
| Compatibility: | Healthcare NLP 6.2.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [bge_embeddings] |
| Output Labels: | [snomed_code] |
| Language: | en |
| Size: | 2.7 GB |
| Case sensitive: | false |