Description
This model maps extracted clinical NER entities to LOINC codes using sbiobert_base_cased_mli
Sentence Bert Embeddings. It is trained with augmented cased (unlowered) concept names since sbiobert model is cased.
Predicted Entities
LOINC
Live Demo Open in Colab Copy S3 URI
How to use
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentenceDetector = SentenceDetectorDLModel.pretrained()\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
word_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical','en', 'clinical/models')\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings")
rad_ner = MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models") \
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")
rad_ner_converter = NerConverter() \
.setInputCols(["sentence", "token", "ner"]) \
.setOutputCol("ner_chunk")\
.setWhiteList(['Test'])
chunk2doc = Chunk2Doc() \
.setInputCols("ner_chunk") \
.setOutputCol("ner_chunk_doc")
sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")\
.setInputCols(["ner_chunk_doc"])\
.setOutputCol("sbert_embeddings")\
.setCaseSensitive(True)
resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_loinc_cased", "en", "clinical/models") \
.setInputCols(["sbert_embeddings"])\
.setOutputCol("resolution")\
.setDistanceFunction("EUCLIDEAN")
pipeline = Pipeline(stages = [
documentAssembler,
sentenceDetector,
tokenizer,
word_embeddings,
rad_ner,
rad_ner_converter,
chunk2doc,
sbert_embedder,
resolver
])
data = spark.createDataFrame([["""The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hemoglobin is 8.2%."""]]).toDF("text")
result = pipeline.fit(data).transform(data)
val documentAssembler = DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentenceDetector = SentenceDetectorDLModel.pretrained()
.setInputCols(Array("document"))
.setOutputCol("sentence")
val tokenizer = Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")
val rad_ner = MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "embeddings"))
.setOutputCol("ner")
val rad_ner_converter = NerConverter()
.setInputCols(Array("sentence", "token", "ner"))
.setOutputCol("ner_chunk")
.setWhiteList(Array("Test"))
val chunk2doc = Chunk2Doc()
.setInputCols("ner_chunk")
.setOutputCol("ner_chunk_doc")
val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
.setInputCols(Array("ner_chunk_doc"))
.setOutputCol("sbert_embeddings")
.setCaseSensitive(True)
val resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_loinc_cased", "en", "clinical/models")
.setInputCols(Array("sbert_embeddings"))
.setOutputCol("resolution")\
.setDistanceFunction("EUCLIDEAN")
val pipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, word_embeddings, rad_ner, rad_ner_converter, chunk2doc, sbert_embedder, resolver))
val data = Seq("The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hemoglabin is 8.2%.").toDF("text")
val result = pipeline.fit(data).transform(data)
import nlu
nlu.load("en.resolve.loinc_cased").predict("""The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hemoglobin is 8.2%.""")
Results
+-------------------------------------+------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ner_chunk|entity| resolution| all_codes| resolutions|
+-------------------------------------+------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| BMI| Test| LP35925-4|[LP35925-4, 59574-4, BDYCRC, 73964-9, 59574-4,... |[Body mass index (BMI), Body mass index, Body circumference, Body muscle mass, Body mass index (BMI) [Percentile], ... |
| aspartate aminotransferase| Test| 14409-7|[14409-7, 1916-6, 16325-3, 16324-6, 43822-6, 308... |[Aspartate aminotransferase, Aspartate aminotransferase/Alanine aminotransferase, Alanine aminotransferase/Aspartate aminotransferase, Alanine aminotransferase, Aspartate aminotransferase [Prese... |
| alanine aminotransferase| Test| 16324-6|[16324-6, 16325-3, 14409-7, 1916-6, 59245-1, 30... |[Alanine aminotransferase, Alanine aminotransferase/Aspartate aminotransferase, Aspartate aminotransferase, Aspartate aminotransferase/Alanine aminotransferase, Alanine glyoxylate aminotransfer,... |
| hemoglobin| Test| 14775-1|[14775-1, 16931-8, 12710-0, 29220-1, 15082-1, 72... |[Hemoglobin, Hematocrit/Hemoglobin, Hemoglobin pattern, Haptoglobin, Methemoglobin, Oxyhemoglobin, Hemoglobin test status, Verdohemoglobin, Hemoglobin A, Hemoglobin distribution width, Myoglobin,... |
+-------------------------------------+------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Model Information
Model Name: | sbiobertresolve_loinc_cased |
Compatibility: | Healthcare NLP 3.3.4+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [sentence_embeddings] |
Output Labels: | [loinc_code] |
Language: | en |
Size: | 648.5 MB |
Case sensitive: | true |