Sentence Entity Resolver for LOINC (sbiobert_base_cased_mli embeddings)

Description

This model maps extracted clinical NER entities to Logical Observation Identifiers Names and Codes (LOINC) codes using `sbiobert_base_cased_mli` Sentence BERT embeddings. It is trained on the numeric LOINC codes and does not include the LOINC “Document Ontology” codes, which start with the letter “L”. It also provides the official resolution of each code within brackets.

Predicted Entities

loinc_code

Copy S3 URI

How to use


# Wrap the raw "text" column into Spark NLP document annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences with the pretrained healthcare detector.
sentence_detector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize every sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level clinical embeddings consumed by the NER model below.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Tag tokens with the pretrained "ner_radiology" model.
ner_model = (
    MedicalNerModel.pretrained("ner_radiology", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Merge token tags into chunks; the whitelist keeps only chunks labelled "Test".
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["Test"])
)

# Re-wrap each chunk as a document so it can be embedded on its own.
chunk2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Sentence-level BERT embeddings of each chunk (case-insensitive).
sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# Resolve each chunk embedding to LOINC codes using Euclidean distance.
resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_loinc_numeric_augmented", "en", "clinical/models")
    .setInputCols(["sbert_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)

# Stages run in the order listed; each one consumes columns produced earlier.
nlpPipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        word_embeddings,
        ner_model,
        ner_converter,
        chunk2doc,
        sbert_embedder,
        resolver,
    ]
)

# One-row example DataFrame with a single "text" column.
sample_text = """The patient is a 22-year-old female with a history of obesity. She has a Body mass index (BMI) of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126."""
data = spark.createDataFrame([[sample_text]]).toDF("text")

# Fit the pipeline on the example data and annotate the same data.
result = nlpPipeline.fit(data).transform(data)


// Wrap the raw "text" column into Spark NLP document annotations.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Split each document into sentences with the pretrained healthcare detector.
val sentenceDetector = SentenceDetectorDLModel
  .pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Tokenize every sentence.
val sentenceTokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Token-level clinical embeddings consumed by the NER model below.
val wordEmbeddings = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("embeddings")

// Tag tokens with the pretrained "ner_radiology" model.
val nerModel = MedicalNerModel
  .pretrained("ner_radiology", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "embeddings"))
  .setOutputCol("ner")

// Merge token tags into chunks; the whitelist keeps only chunks labelled "Test".
val nerConverter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("Test"))

// Re-wrap each chunk as a document so it can be embedded on its own.
val chunkToDoc = new Chunk2Doc()
  .setInputCols("ner_chunk")
  .setOutputCol("ner_chunk_doc")

// Sentence-level BERT embeddings of each chunk (case-insensitive).
val sbertEmbedder = BertSentenceEmbeddings
  .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sbert_embeddings")
  .setCaseSensitive(false)

// Resolve each chunk embedding to LOINC codes using Euclidean distance.
val loincResolver = SentenceEntityResolverModel
  .pretrained("sbiobertresolve_loinc_numeric_augmented", "en", "clinical/models")
  .setInputCols(Array("sbert_embeddings"))
  .setOutputCol("resolution")
  .setDistanceFunction("EUCLIDEAN")

// Stages run in the order listed; each one consumes columns produced earlier.
val nlpPipeline = new Pipeline().setStages(
  Array(
    documentAssembler,
    sentenceDetector,
    sentenceTokenizer,
    wordEmbeddings,
    nerModel,
    nerConverter,
    chunkToDoc,
    sbertEmbedder,
    loincResolver
  )
)

// One-row example DataFrame with a single "text" column.
// The original `Seq([[...]])` used Python list-literal syntax, which does not compile in Scala.
// NOTE: `toDF` on a Seq requires `spark.implicits._` in scope (spark-shell imports it automatically).
val data = Seq("""The patient is a 22-year-old female with a history of obesity. She has a Body mass index (BMI) of 33.5 kg/m2, aspartate aminotransferase 64, and alanine aminotransferase 126.""").toDF("text")

// Fit the pipeline on the example data and annotate the same data.
val result = nlpPipeline.fit(data).transform(data)

Results


+--------------------------+-----+---+---------+----------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|                     chunk|begin|end|ner_label|loinc_code|                                                 description|                                                 resolutions|                                                   all_codes|                                                  aux_labels|
+--------------------------+-----+---+---------+----------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|                       BMI|   90| 92|     Test|   39156-5|                         BMI [Body mass index (BMI) [Ratio]]|BMI [Body mass index (BMI) [Ratio]]:::BM [IDH1 gene exon ...|39156-5:::100305-2:::100225-2:::38410-7:::72087-0:::33573...|Observation:::Observation:::Observation:::Observation:::O...|
|aspartate aminotransferase|  110|135|     Test|  100739-2|Aspartate transaminase [Aspartate aminotransferase.macrom...|Aspartate transaminase [Aspartate aminotransferase.macrom...|100739-2:::43822-6:::77063-6:::53877-7:::100738-4:::21081...|Observation:::Observation:::Observation:::Observation:::O...|
|  alanine aminotransferase|  145|168|     Test|  100738-4|Alanine transaminase [Alanine aminotransferase.macromolec...|Alanine transaminase [Alanine aminotransferase.macromolec...|100738-4:::100739-2:::69383-8:::59245-1:::25302-1:::43822...|Observation:::Observation:::Observation:::Observation:::O...|
+--------------------------+-----+---+---------+----------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+

Model Information

Model Name: sbiobertresolve_loinc_numeric_augmented
Compatibility: Healthcare NLP 5.5.0+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [loinc_code]
Language: en
Size: 667.3 MB
Case sensitive: false

References

This model is trained with an augmented version of the LOINC v2.78 dataset, released on 2024-08-06.