Description
This model maps extracted clinical NER entities to LOINC codes using sbluebert_base_uncased_mli
Sentence Bert Embeddings. It trained on the augmented version of the uncased (lowercased) dataset which is used in previous LOINC resolver models.
Predicted Entities
LOINC Code
Live Demo Open in Colab Copy S3 URI
How to use
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
.setInputCols("document")\
.setOutputCol("sentence")
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")
word_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical','en', 'clinical/models')\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings")
ner = MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models") \
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")
ner_converter = NerConverter() \
.setInputCols(["sentence", "token", "ner"]) \
.setOutputCol("ner_chunk")\
.setWhiteList(['Test'])
chunk2doc = Chunk2Doc() \
.setInputCols("ner_chunk") \
.setOutputCol("ner_chunk_doc")
sbert_embedder = BertSentenceEmbeddings.pretrained("sbluebert_base_uncased_mli", "en", "clinical/models")\
.setInputCols(["ner_chunk_doc"])\
.setOutputCol("sbert_embeddings")\
.setCaseSensitive(True)
resolver = SentenceEntityResolverModel.pretrained("sbluebertresolve_loinc_uncased", "en", "clinical/models") \
.setInputCols(["sbert_embeddings"])\
.setOutputCol("resolution")\
.setDistanceFunction("EUCLIDEAN")
pipeline_loinc = Pipeline(stages = [
documentAssembler,
sentenceDetector,
tokenizer,
word_embeddings,
ner,
ner_converter,
chunk2doc,
sbert_embedder,
resolver
])
test = """The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hgba1c is 8.2%."""
sparkDF = spark.createDataFrame([[test]]).toDF("text")
result = pipeline_loinc.fit(sparkDF).transform(sparkDF)
val documentAssembler = DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
.setInputCols("document")
.setOutputCol("sentence")
val tokenizer = Tokenizer()
.setInputCols("document")
.setOutputCol("token")
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")
val ner = MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "embeddings"))
.setOutputCol("ner")
val ner_converter = NerConverter()
.setInputCols(Array("sentence", "token", "ner"))
.setOutputCol("ner_chunk")
.setWhiteList(Array("Test"))
val chunk2doc = Chunk2Doc()
.setInputCols("ner_chunk")
.setOutputCol("ner_chunk_doc")
val sbert_embedder = BertSentenceEmbeddings.pretrained("sbluebert_base_uncased_mli", "en", "clinical/models")
.setInputCols(Array("ner_chunk_doc"))
.setOutputCol("sbert_embeddings")
.setCaseSensitive(True)
val resolver = SentenceEntityResolverModel.pretrained("sbluebertresolve_loinc_uncased", "en", "clinical/models")
.setInputCols(Array("sbert_embeddings"))
.setOutputCol("resolution")
.setDistanceFunction("EUCLIDEAN")
val pipeline_loinc = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, word_embeddings, ner, ner_converter, chunk2doc, sbert_embedder, resolver))
val data = Seq("The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hgba1c is 8.2%.").toDF("text")
val result = pipeline_loinc.fit(data).transform(data)
import nlu
nlu.load("en.resolve.loinc_uncased").predict("""The patient is a 22-year-old female with a history of obesity. She has a BMI of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126. Her hgba1c is 8.2%.""")
Results
+-------------------------------------+------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ner_chunk|entity| resolution| all_codes| resolutions|
+-------------------------------------+------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| BMI| Test| 39156-5|[39156-5, LP35925-4, BDYCRC, 73964-9, 59574-4,...] |[Body mass index, Body mass index (BMI), Body circumference, Body muscle mass, Body mass index (BMI) [Percentile], ...] |
| aspartate aminotransferase| Test| 14409-7|['14409-7', '16325-3', '1916-6', '16324-6',...] |['Aspartate aminotransferase', 'Alanine aminotransferase/Aspartate aminotransferase', 'Aspartate aminotransferase/Alanine aminotransferase', 'Alanine aminotransferase', ...] |
| alanine aminotransferase| Test| 16324-6|['16324-6', '1916-6', '16325-3', '59245-1',...] |['Alanine aminotransferase', 'Aspartate aminotransferase/Alanine aminotransferase', 'Alanine aminotransferase/Aspartate aminotransferase', 'Alanine glyoxylate aminotransferase',...] |
| hgba1c| Test| 41995-2|['41995-2', 'LP35944-5', 'LP19717-5', '43150-2',...]|['Hemoglobin A1c', 'HbA1c measurement device', 'HBA1 gene', 'HbA1c measurement device panel', ...] |
+-------------------------------------+------+-----------+------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Model Information
Model Name: | sbluebertresolve_loinc_uncased |
Compatibility: | Healthcare NLP 3.3.4+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [sentence_embeddings] |
Output Labels: | [loinc_code] |
Language: | en |
Size: | 647.9 MB |
Case sensitive: | false |
Data Source
Trained on standard LOINC coding system.