Sentence Entity Resolver for Clinical Abbreviations and Acronyms (sbiobert_base_cased_mli embeddings)

Description

This model maps clinical abbreviations and acronyms to their meanings using `sbiobert_base_cased_mli` Sentence BERT embeddings. It is an improved version of the base model and includes more varied data.

Predicted Entities

Abbreviation Meanings

Open in Colab Copy S3 URI

How to use

# Abbreviation-resolution pipeline: raw text -> tokens -> clinical word
# embeddings -> abbreviation NER -> chunk embeddings -> resolved meaning.

# Reads the "text" column and produces the "document" annotation column.
document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

tokenizer = Tokenizer()\
    .setInputCols(["document"])\
    .setOutputCol("token")

# Word embeddings consumed by the NER model below.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["document", "token"])\
    .setOutputCol("word_embeddings")

clinical_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models")\
    .setInputCols(["document", "token", "word_embeddings"])\
    .setOutputCol("ner")

# Turn NER tags into chunks; the whitelist restricts output to ABBR chunks.
ner_converter = NerConverterInternal()\
    .setInputCols(["document", "token", "ner"])\
    .setOutputCol("ner_chunk")\
    .setWhiteList(['ABBR'])

# Sentence BERT embeddings computed for each abbreviation chunk.
sentence_chunk_embeddings = BertSentenceChunkEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")\
    .setInputCols(["document", "ner_chunk"])\
    .setOutputCol("sentence_embeddings")\
    .setChunkWeight(0.5)\
    .setCaseSensitive(True)

# The last call in the chain must NOT end with a backslash: a dangling
# continuation would splice the following statement onto this one.
abbr_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_clinical_abbreviation_acronym", "en", "clinical/models")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("abbr_meaning")\
    .setDistanceFunction("EUCLIDEAN")

resolver_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        sentence_chunk_embeddings,
        abbr_resolver,
    ])

# Every stage is either pretrained or stateless here, so the pipeline is
# fitted on a single empty row just to obtain a PipelineModel.
model = resolver_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))

sample_text = "Gravid with estimated fetal weight of 6-6/12 pounds. LOWER EXTREMITIES: No edema. LABORATORY DATA: Laboratory tests include a CBC which is normal. Blood Type: AB positive. Rubella: Immune. VDRL: Nonreactive. Hepatitis C surface antigen: Negative. HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet."

abbr_result = model.transform(spark.createDataFrame([[sample_text]]).toDF('text'))
// Abbreviation-resolution pipeline: raw text -> tokens -> clinical word
// embeddings -> abbreviation NER -> chunk embeddings -> resolved meaning.

// Reads the "text" column and produces the "document" annotation column.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

// Word embeddings consumed by the NER model below.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("word_embeddings")

val clinical_ner = MedicalNerModel.pretrained("ner_abbreviation_clinical", "en", "clinical/models")
  .setInputCols(Array("document", "token", "word_embeddings"))
  .setOutputCol("ner")

// Turn NER tags into chunks; the whitelist restricts output to ABBR chunks.
val ner_converter = new NerConverterInternal()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("ABBR"))

// Sentence BERT embeddings computed for each abbreviation chunk.
// Scala's boolean literal is lowercase `true` (`True` is Python and does not compile).
val sentence_chunk_embeddings = BertSentenceChunkEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
  .setInputCols(Array("document", "ner_chunk"))
  .setOutputCol("sentence_embeddings")
  .setChunkWeight(0.5)
  .setCaseSensitive(true)

val abbr_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_clinical_abbreviation_acronym", "en", "clinical/models")
  .setInputCols(Array("sentence_embeddings"))
  .setOutputCol("abbr_meaning")
  .setDistanceFunction("EUCLIDEAN")

// Pipeline.setStages expects a single Array of stages, not a varargs list.
val resolver_pipeline = new Pipeline().setStages(Array(
  document_assembler,
  tokenizer,
  word_embeddings,
  clinical_ner,
  ner_converter,
  sentence_chunk_embeddings,
  abbr_resolver
))

// NOTE: `.toDS()` relies on the SparkSession implicits; outside spark-shell
// this presumably needs `import spark.implicits._` in scope — confirm for your setup.
val sample_text = Seq("""Gravid with estimated fetal weight of 6-6/12 pounds. LOWER EXTREMITIES: No edema. LABORATORY DATA: Laboratory tests include a CBC which is normal. Blood Type: AB positive. Rubella: Immune. VDRL: Nonreactive. Hepatitis C surface antigen: Negative. HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet.""").toDS().toDF("text")

val abbr_result = resolver_pipeline.fit(sample_text).transform(sample_text)
import nlu

# Sample clinical note to run through the abbreviation/acronym resolver.
clinical_note = """Gravid with estimated fetal weight of 6-6/12 pounds. LOWER EXTREMITIES: No edema. LABORATORY DATA: Laboratory tests include a CBC which is normal. Blood Type: AB positive. Rubella: Immune. VDRL: Nonreactive. Hepatitis C surface antigen: Negative. HIV: Negative. One-Hour Glucose: 117. Group B strep has not been done as yet."""

# Load the NLU pipeline by its reference name, then predict on the note.
abbreviation_resolver = nlu.load("en.resolve.clinical_abbreviation_acronym")
abbreviation_resolver.predict(clinical_note)

Results

|    | chunk   | abbr_meaning                         | all_k_results                                                                                                                                                                                                |
|---:|:--------|:-------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | CBC     | Complete Blood Count                 | Complete Blood Count:::Complete blood count:::blood group in ABO system:::(complement) component 4:::abortion:::carbohydrate antigen:::clear to auscultation:::carcinoembryonic antigen:::cervical (level) 4 |
|  1 | AB      | blood group in ABO system            | blood group in ABO system:::abortion                                                                                                                                                                         |
|  2 | VDRL    | Venereal disease research laboratory | Venereal disease research laboratory:::venous blood gas:::leukocyte esterase:::vertical banded gastroplasty                                                                                                  |
|  3 | HIV     | human immunodeficiency virus         | human immunodeficiency virus:::blood group in ABO system:::abortion:::fluorescent in situ hybridization                                                                                                      |

Model Information

Model Name: sbiobertresolve_clinical_abbreviation_acronym
Compatibility: Healthcare NLP 3.3.4+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [output]
Language: en
Size: 112.3 MB
Case sensitive: true

References

Trained on an in-house curated dataset.