Description
This model utilizes BERT sentence embeddings from sbiobert_base_cased_mli
to map extracted medical entities (no concept class) to SNOMED codes.
Predicted Entities
How to use
# Python example: resolve NER chunks to SNOMED codes.
# Column flow: text -> document -> sentence -> token -> embeddings
#              -> ner_jsl -> ner_chunk -> ner_chunk_doc -> sbert_embeddings -> snomed_code

# Reads the raw "text" column and writes "document".
document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Reads "document" and writes "sentence".
sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# Reads "sentence" and writes "token".
# The last line of a chain must NOT end with a backslash: it would splice the
# next assignment onto this expression and raise a SyntaxError.
tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Word embeddings consumed by the NER model below.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# Pretrained "ner_jsl" model; its tags are written to the "ner_jsl" column.
ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_jsl")

# Converts the NER tags into chunks, keeping only the whitelisted labels.
ner_jsl_converter = NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_jsl"]) \
    .setOutputCol("ner_chunk")\
    .setWhiteList(["Drug",
                   "Drug_Ingredient",
                   "Drug_BrandName",
                   "Disease_Syndrome_Disorder",
                   "Kidney_Disease",
                   "Heart_Disease",
                   "Diabetes",
                   "Oncological"])

# Re-wraps each chunk as a document so it can be embedded on its own.
chunk2doc = Chunk2Doc()\
    .setInputCols("ner_chunk")\
    .setOutputCol("ner_chunk_doc")

# Sentence embeddings of each chunk, using the embeddings model named in the description.
sbert_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\
    .setInputCols(["ner_chunk_doc"])\
    .setOutputCol("sbert_embeddings")\
    .setCaseSensitive(False)

# Resolves each chunk embedding to a SNOMED code, written to "snomed_code".
snomed_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_no_class", "en", "clinical/models")\
    .setInputCols(["sbert_embeddings"]) \
    .setOutputCol("snomed_code")\
    .setDistanceFunction("EUCLIDEAN")

# Stages are listed in dependency order: each one reads columns produced earlier.
snomed_pipeline = Pipeline(stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_jsl,
    ner_jsl_converter,
    chunk2doc,
    sbert_embeddings,
    snomed_resolver
])

# Single-row example input; assumes an active SparkSession named `spark`.
data = spark.createDataFrame([["""John's doctor prescribed ofloxacin for his secondary conjunctivitis, cefixime for his cystic urethritis, ibuprofen for his inflammation, and cilnidipine for his hypertension on 2023-12-01."""]]).toDF("text")

model = snomed_pipeline.fit(data)
result = model.transform(data)
// Scala example: resolve NER chunks to SNOMED codes.
// Column flow: text -> document -> sentence -> token -> embeddings
//              -> ner_jsl -> ner_chunk -> ner_chunk_doc -> sbert_embeddings -> snomed_code
// Stage names and column names mirror the Python example so both snippets read the same.

// Reads the raw "text" column and writes "document".
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Reads "document" and writes "sentence".
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Reads "sentence" and writes "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Word embeddings consumed by the NER model below.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence","token"))
  .setOutputCol("embeddings")

// Pretrained "ner_jsl" model; its tags go to "ner_jsl", the column the converter reads.
val ner_jsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
  .setInputCols(Array("sentence","token","embeddings"))
  .setOutputCol("ner_jsl")

// Converts the NER tags into chunks, keeping only the whitelisted labels.
// NOTE(review): if setWhiteList is declared as String* varargs in your library
// version, pass the array as `Array(...): _*` -- confirm against the API docs.
val ner_jsl_converter = new NerConverterInternal()
  .setInputCols(Array("sentence","token","ner_jsl"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("Drug",
    "Drug_Ingredient",
    "Drug_BrandName",
    "Disease_Syndrome_Disorder",
    "Kidney_Disease",
    "Heart_Disease",
    "Diabetes",
    "Oncological"))

// Re-wraps each chunk as a document so it can be embedded on its own.
val chunk2doc = new Chunk2Doc()
  .setInputCols(Array("ner_chunk"))
  .setOutputCol("ner_chunk_doc")

// Sentence embeddings of each chunk. Scala's boolean literal is lowercase `false`.
val sbert_embeddings = BertSentenceEmbeddings
  .pretrained("sbiobert_base_cased_mli","en","clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sbert_embeddings")
  .setCaseSensitive(false)

// Resolves each chunk embedding to a SNOMED code, written to "snomed_code".
val snomed_resolver = SentenceEntityResolverModel
  .pretrained("sbiobertresolve_snomed_no_class", "en", "clinical/models")
  .setInputCols(Array("sbert_embeddings"))
  .setOutputCol("snomed_code")
  .setDistanceFunction("EUCLIDEAN")

// Stages are listed in dependency order: each one reads columns produced earlier.
val snomed_pipeline = new Pipeline().setStages(Array(
  document_assembler,
  sentence_detector,
  tokenizer,
  word_embeddings,
  ner_jsl,
  ner_jsl_converter,
  chunk2doc,
  sbert_embeddings,
  snomed_resolver
))

// Single-row example input. `toDF` on a Seq needs `spark.implicits._` in scope
// (already imported in spark-shell; import it explicitly elsewhere).
val data = Seq("John's doctor prescribed ofloxacin for his secondary conjunctivitis, cefixime for his cystic urethritis, ibuprofen for his inflammation, and cilnidipine for his hypertension on 2023-12-01.").toDF("text")

val model = snomed_pipeline.fit(data)
val result = model.transform(data)
Results
+-----------------+-------------------------+-----------+---------------------------------------------+--------------------------------------------------+--------------------------------------------------+
| chunk| label|snomed_code| resolution| all_codes| all_resolutions|
+-----------------+-------------------------+-----------+---------------------------------------------+--------------------------------------------------+--------------------------------------------------+
| ofloxacin| Drug_Ingredient| 1252718003| cefixime- and ofloxacin-containing product|1252718003:::1172759009:::1162766006:::11725730...|cefixime- and ofloxacin-containing product:::of...|
| conjunctivitis|Disease_Syndrome_Disorder| 1217666006| secondary conjunctivitis|1217666006:::15680761000119102:::1177057009:::1...|secondary conjunctivitis:::left infectious conj...|
| cefixime| Drug_Ingredient| 1217570005| cefixime trihydrate|1217570005:::1162766006:::1252718003:::50020121...|cefixime trihydrate:::fropenem:::cefixime- and ...|
|cystic urethritis|Disease_Syndrome_Disorder| 1259233009| cystic urethritis|1259233009:::1259241009:::1259225008:::11792350...|cystic urethritis:::stricture of membranous ure...|
| ibuprofen| Drug_Ingredient| 1172854008|ibuprofen- and paracetamol-containing product|1172854008:::1269077005:::1217598008:::11728570...|ibuprofen- and paracetamol-containing product::...|
| cilnidipine| Drug_Ingredient| 1177123004| cilnidipine|1177123004:::1179035008:::1217308000:::11936630...|cilnidipine:::cilnidipine-containing product:::...|
+-----------------+-------------------------+-----------+---------------------------------------------+--------------------------------------------------+--------------------------------------------------+
Model Information
Model Name: | sbiobertresolve_snomed_no_class |
Compatibility: | Healthcare NLP 5.3.0+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [sentence_embeddings] |
Output Labels: | [snomed_code] |
Language: | en |
Size: | 104.0 MB |
Case sensitive: | false |
References
This model is trained on an augmented version of the NIH September 2023 SNOMED CT United States (US) Edition.