Description
This model maps clinical entities and concepts to SNOMED codes using `sbiobert_base_cased_mli` Sentence BERT embeddings.
How to use
# Stage 1: reads the "text" column and writes "document".
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: "document" -> "sentence", using a pretrained sentence detector.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: token-level embeddings consumed by the NER model below.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained "ner_jsl" model writes its tags to "ner_jsl".
ner_jsl = (
    MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_jsl")
)

# Stage 6: converts the NER tags into chunks, restricted to the
# whitelisted entity labels.
ner_jsl_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner_jsl"])
    .setOutputCol("ner_jsl_chunk")
    .setWhiteList([
        "Procedure",
        "Substance",
        "Drug_Ingredient",
        "Internal_organ_or_component",
        "Modifier",
        "BMI",
        "LDL",
        "External_body_part_or_region",
        "Alcohol",
        "Treatment",
        "Test",
        "Smoking",
    ])
)

# Stage 7: "ner_jsl_chunk" -> "ner_chunk_doc", the input of the sentence embedder.
chunk2doc = (
    Chunk2Doc()
    .setInputCols("ner_jsl_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Stage 8: sentence-level BERT embeddings for each chunk document.
sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# Stage 9: the resolver writes its output to "snomed_code".
snomed_resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_auxConcepts","en","clinical/models")
    .setInputCols(["sbert_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Assemble the stages in the order their input columns become available.
nlpPipeline = Pipeline(stages=[
    documentAssembler, sentenceDetector, tokenizer,
    word_embeddings, ner_jsl, ner_jsl_converter,
    chunk2doc, sbert_embedder, snomed_resolver,
])

sample_text = """This is an 82-year-old male with a history of prior tobacco use, hypertension, chronic renal insufficiency, COPD, gastritis, and TIA. He initially presented to Braintree with a nonspecific ST-T abnormality and was transferred to St. Margaret’s Center. He underwent cardiac catheterization because of occlusion of the mid left anterior descending coronary artery lesion, which was complicated by hypotension and bradycardia. He required atropine, IV fluids, and dopamine, possibly secondary to a vagal reaction. He was subsequently transferred to the CCU for close monitoring. He was hemodynamically stable at the time of admission to the CCU."""

# Single-row DataFrame with one "text" column; fit and transform use the same data.
df = spark.createDataFrame([[sample_text]]).toDF("text")
pipelineModel = nlpPipeline.fit(df)
result = pipelineModel.transform(df)
# Stage 1: reads the "text" column and writes "document".
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: "document" -> "sentence", using a pretrained sentence detector.
sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: "sentence" -> "token".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: token-level embeddings consumed by the NER model below.
word_embeddings = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained "ner_jsl" model writes its tags to "ner_jsl".
ner_jsl = (
    medical.NerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_jsl")
)

# Stage 6: converts the NER tags into chunks, restricted to the
# whitelisted entity labels.
ner_jsl_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_jsl"])
    .setOutputCol("ner_jsl_chunk")
    .setWhiteList([
        "Procedure",
        "Substance",
        "Drug_Ingredient",
        "Internal_organ_or_component",
        "Modifier",
        "BMI",
        "LDL",
        "External_body_part_or_region",
        "Alcohol",
        "Treatment",
        "Test",
        "Smoking",
    ])
)

# Stage 7: "ner_jsl_chunk" -> "ner_chunk_doc", the input of the sentence embedder.
chunk2doc = (
    nlp.Chunk2Doc()
    .setInputCols("ner_jsl_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Stage 8: sentence-level BERT embeddings for each chunk document.
sbert_embedder = (
    nlp.BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# Stage 9: the resolver writes its output to "snomed_code".
snomed_resolver = (
    medical.SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_auxConcepts","en","clinical/models")
    .setInputCols(["sbert_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Assemble the stages in the order their input columns become available.
nlpPipeline = nlp.Pipeline(stages=[
    documentAssembler, sentenceDetector, tokenizer,
    word_embeddings, ner_jsl, ner_jsl_converter,
    chunk2doc, sbert_embedder, snomed_resolver,
])

sample_text = """ This is an 82-year-old male with a history of prior tobacco use, hypertension, chronic renal insufficiency, COPD, gastritis, and TIA. He initially presented to Braintree with a nonspecific ST-T abnormality and was transferred to St. Margaret’s Center. He underwent cardiac catheterization because of occlusion of the mid left anterior descending coronary artery lesion, which was complicated by hypotension and bradycardia. He required atropine, IV fluids, and dopamine, possibly secondary to a vagal reaction. He was subsequently transferred to the CCU for close monitoring. He was hemodynamically stable at the time of admission to the CCU."""

# Single-row DataFrame with one "text" column; fit and transform use the same data.
df = spark.createDataFrame([[sample_text]]).toDF("text")
pipelineModel = nlpPipeline.fit(df)
result = pipelineModel.transform(df)
// Stage 1: reads the "text" column and writes "document".
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: "document" -> "sentence", using a pretrained sentence detector.
val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Stage 3: "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Stage 4: token-level embeddings consumed by the NER model below.
val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("embeddings")

// Stage 5: pretrained "ner_jsl" model writes its tags to "ner_jsl".
val nerJsl = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "embeddings"))
  .setOutputCol("ner_jsl")

// Stage 6: converts the NER tags into chunks, restricted to the whitelisted entity labels.
val nerJslConverter = new NerConverter()
  .setInputCols(Array("sentence", "token", "ner_jsl"))
  .setOutputCol("ner_jsl_chunk")
  .setWhiteList(Array("Procedure","Substance","Drug_Ingredient","Internal_organ_or_component","Modifier","BMI","LDL","External_body_part_or_region","Alcohol","Treatment","Test","Smoking"))

// Stage 7: "ner_jsl_chunk" -> "ner_chunk_doc", the input of the sentence embedder.
val chunk2doc = new Chunk2Doc()
  .setInputCols(Array("ner_jsl_chunk"))
  .setOutputCol("ner_chunk_doc")

// Stage 8: sentence-level BERT embeddings for each chunk document.
// Scala's boolean literal is lowercase `false`; `False` (the Python spelling) does not compile.
val sbertEmbedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sbert_embeddings")
  .setCaseSensitive(false)

// Stage 9: the resolver writes its output to "snomed_code".
val snomedResolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_auxConcepts", "en", "clinical/models")
  .setInputCols(Array("sbert_embeddings"))
  .setOutputCol("snomed_code")
  .setDistanceFunction("EUCLIDEAN")

// Assemble the stages in the order their input columns become available.
val nlpPipeline = new Pipeline().setStages(Array(
  documentAssembler,
  sentenceDetector,
  tokenizer,
  wordEmbeddings,
  nerJsl,
  nerJslConverter,
  chunk2doc,
  sbertEmbedder,
  snomedResolver
))

val sample_text = """ This is an 82-year-old male with a history of prior tobacco use, hypertension, chronic renal insufficiency, COPD, gastritis, and TIA. He initially presented to Braintree with a nonspecific ST-T abnormality and was transferred to St. Margaret’s Center. He underwent cardiac catheterization because of occlusion of the mid left anterior descending coronary artery lesion, which was complicated by hypotension and bradycardia. He required atropine, IV fluids, and dopamine, possibly secondary to a vagal reaction. He was subsequently transferred to the CCU for close monitoring. He was hemodynamically stable at the time of admission to the CCU."""

// NOTE: `toDF` on a Seq comes from Spark's implicits; outside spark-shell,
// add `import spark.implicits._` before this line.
val df = Seq(sample_text).toDF("text")

// Fit and transform use the same single-row DataFrame.
val result = nlpPipeline.fit(df).transform(df)
Results
| sent_id | ner_chunk | entity | snomed_code | resolution | all_codes | all_resolutions | aux_list |
| ------- | ----------------------- | --------------- | ----------- | ----------------------- | ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------- |
| 0 | tobacco | Smoking | 57264008 | tobacco | [57264008, 102407002, 39953003, 159882006, 230491000087101, 102408007, 722496004, 38402001, 7110...] | [tobacco, tobacco smoke, tobacco - substance, tobacco processor, shisha tobacco, cigarette smoke...] | [Organism, Substance, Substance, Social Context, No_Concept_Class, Substance, Physical Object, ...] |
| 1 | nonspecific | Modifier | 10003008 | non-specific | [10003008, 261992003, 863956004, 300844001, 278001007, 863967003, 236821005, 276119007, 12170110...] | [non-specific, non-biological, non-sterile, non-resonant, nonspecific site, non-absorbable, ...] | [Qualifier Value, Qualifier Value, Qualifier Value, Qualifier Value, Body Structure, Qualifier Value...] |
| 2 | cardiac catheterization | Procedure | 41976001 | cardiac catheterization | [41976001, 705923009, 721968000, 467735004, 129085009, 425315000, 128956009, 467525008, 12895700...] | [cardiac catheterization, cardiac catheter, cardiac catheterization report, ...] | [Procedure, Physical Object, Record Artifact, Physical Object, Qualifier Value, Procedure, ...] |
| 3 | atropine | Drug_Ingredient | 73949004 | atropine | [73949004, 105075009, 349945006, 410493009, 74237004, 50507004, 109207004, 349946007, 34013000...] | [atropine, atropine measurement, oral atropine, atropinization, atropine sulfate, ...] | [Pharma/Biol Product, Procedure, Clinical Drug Form, Procedure, Substance, Substance, ...] |
| 3 | fluids | Drug_Ingredient | 255765007 | fluid | [255765007, 246498002, 258442002, 251851008, 32457005, 396276007, 406142009, 251840008, 33463005...] | [fluid, fluid used, fluid sample, fluid input, body fluid, fluid appearance, ...] | [Qualifier Value, Attribute, Specimen, Observable Entity, Substance, Observable Entity, ...] |
| 3 | dopamine | Drug_Ingredient | 59187003 | dopamine | [59187003, 412383006, 37484001, 32779004, 412845004, 713493000, 418222008, 12307008, 384952006...] | [dopamine, dopamine agent, dopamine receptor, dopamine measurement, serum dopamine level, ...] | [Pharma/Biol Product, Substance, Substance, Procedure, Procedure, Substance, ...] |
Model Information
| Model Name: | sbiobertresolve_snomed_auxConcepts |
| Compatibility: | Healthcare NLP 6.3.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [sentence_embeddings] |
| Output Labels: | [snomed_code] |
| Language: | en |
| Size: | 2.1 GB |
| Case sensitive: | false |