Few-Shot Assertion Model (Smoking)

Description

Assigns an assertion status to clinical entities extracted by NER, based on their context in the text. The model was trained on a collection of clinical and biomedical datasets curated in-house.

Predicted Entities

Present, Absent, Past

Copy S3 URI

How to use

# Few-shot smoking assertion pipeline (Python).
# Assumes an active Spark session named `spark` and that the annotator classes
# used below (and pyspark's `Pipeline`) have already been imported.

# Turn the raw "text" column into a "document" annotation.
document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences with the pretrained healthcare detector.
sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# Tokenize sentences. "\\/" is written with an escaped backslash so the literal
# is the same backslash-slash string as before, without Python's
# invalid-escape-sequence warning.
tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")\
    .setSplitChars(["-", "\\/"])

# Clinical word embeddings consumed by the NER stage.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

# NER stage: the pretrained `ner_sdoh` model supplies the candidate entities.
ner_smoking = MedicalNerModel.pretrained("ner_sdoh","en","clinical/models")\
    .setInputCols(["sentence","token","embeddings"])\
    .setOutputCol("ner_smoking")

# Convert NER tags to chunks; the whitelist is set to the "smoking" label only.
ner_converter = NerConverterInternal()\
    .setInputCols(["sentence","token","ner_smoking"])\
    .setOutputCol("ner_chunk")\
    .setWhiteList(["smoking"])

# Build the per-chunk assertion sentences that the E5 embeddings will encode.
few_shot_assertion_converter = FewShotAssertionSentenceConverter()\
    .setInputCols(["sentence","token","ner_chunk"])\
    .setOutputCol("assertion_sentence")

# Embed each assertion sentence with the smoking-specific E5 model.
e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_smoking", "en", "clinical/models")\
    .setInputCols(["assertion_sentence"])\
    .setOutputCol("assertion_embedding")

# Load the few-shot assertion classifier. `pretrained` is called on the class,
# like the other pretrained stages, instead of on a throwaway instance.
few_shot_assertion_classifier = FewShotAssertionClassifierModel\
    .pretrained("fewhot_assertion_smoking_e5_base_v2_smoking", "en", "clinical/models")\
    .setInputCols(["assertion_embedding"])\
    .setOutputCol("assertion")

# Stage order matters: each stage reads the output columns of earlier stages.
assertion_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_smoking,
    ner_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier
])

# Single-row example DataFrame with one string column named "text".
data = spark.createDataFrame([["""The patient, a 50-year-old man, came to the clinic due to worsening shortness of breath, productive cough, and wheezing. He has a history of heavy smoking, having smoked a pack of cigarettes daily for 20 years. He quit smoking five years ago after recurrent respiratory infections and worsening breathing problems. Despite quitting, he frequently experiences exacerbations of chronic bronchitis, particularly in the winter. Over the past week, his symptoms have intensified, with increased sputum production and dyspnea on exertion."""]]).toDF("text")

# Fit the pipeline on the example data and annotate that same data.
result = assertion_pipeline.fit(data).transform(data)

// Few-shot smoking assertion pipeline (Scala).
// Assumes an active Spark session with `spark.implicits._` in scope (needed by
// `.toDF`) and that the annotator classes used below are already imported.

// Turn the raw "text" column into a "document" annotation.
val document_assembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

// Split each document into sentences with the pretrained healthcare detector.
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols(Array("document"))
    .setOutputCol("sentence")

// Tokenize sentences. "\/" is not a valid escape in a Scala string literal and
// fails to compile; "\\/" yields the intended backslash-slash split pattern.
val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")
    .setSplitChars(Array("-", "\\/"))

// Clinical word embeddings consumed by the NER stage.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
    .setInputCols(Array("sentence","token"))
    .setOutputCol("embeddings")

// NER stage: the pretrained `ner_sdoh` model supplies the candidate entities.
val ner_smoking = MedicalNerModel.pretrained("ner_sdoh","en","clinical/models")
    .setInputCols(Array("sentence","token","embeddings"))
    .setOutputCol("ner_smoking")

// Convert NER tags to chunks; the whitelist is set to the "smoking" label only.
val ner_converter = new NerConverterInternal()
    .setInputCols(Array("sentence","token","ner_smoking"))
    .setOutputCol("ner_chunk")
    .setWhiteList(Array("smoking"))

// Build the per-chunk assertion sentences that the E5 embeddings will encode.
val few_shot_assertion_converter = new FewShotAssertionSentenceConverter()
    .setInputCols(Array("sentence","token","ner_chunk"))
    .setOutputCol("assertion_sentence")

// Embed each assertion sentence with the smoking-specific E5 model.
val e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_smoking", "en", "clinical/models")
    .setInputCols(Array("assertion_sentence"))
    .setOutputCol("assertion_embedding")

// Load the few-shot assertion classifier through the companion object's
// `pretrained`, like the other pretrained stages (no instance is constructed).
val few_shot_assertion_classifier = FewShotAssertionClassifierModel
    .pretrained("fewhot_assertion_smoking_e5_base_v2_smoking", "en", "clinical/models")
    .setInputCols(Array("assertion_embedding"))
    .setOutputCol("assertion")

// Stage order matters: each stage reads the output columns of earlier stages.
val pipeline = new Pipeline().setStages(Array(
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_smoking,
    ner_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier))

// Single-row example DataFrame with one *string* column named "text"
// (a Seq of Arrays would produce an array<string> column instead).
val data = Seq("""The patient, a 50-year-old man, came to the clinic due to worsening shortness of breath, productive cough, and wheezing. He has a history of heavy smoking, having smoked a pack of cigarettes daily for 20 years. He quit smoking five years ago after recurrent respiratory infections and worsening breathing problems. Despite quitting, he frequently experiences exacerbations of chronic bronchitis, particularly in the winter. Over the past week, his symptoms have intensified, with increased sputum production and dyspnea on exertion.""").toDF("text")

// Fit the pipeline on the example data and annotate that same data.
val result = pipeline.fit(data).transform(data)

Results

|    | chunks     |   begin |   end | entities   | assertion   |   confidence |
|---:|:-----------|--------:|------:|:-----------|:------------|-------------:|
|  0 | smoking    |     147 |   153 | Smoking    | Past        |     0.936773 |
|  1 | smoked     |     163 |   168 | Smoking    | Past        |     0.936713 |
|  2 | cigarettes |     180 |   189 | Smoking    | Past        |     0.936727 |
|  3 | smoking    |     219 |   225 | Smoking    | Past        |     0.936954 |

Model Information

Model Name: fewhot_assertion_smoking_e5_base_v2_smoking
Compatibility: Healthcare NLP 5.3.3+
License: Licensed
Edition: Official
Input Labels: [assertion_embedding]
Output Labels: [assertion]
Language: en
Size: 15.2 KB

Benchmarking

       label  precision    recall  f1-score   support
      Absent       0.95      1.00      0.97        19
        Past       0.94      0.89      0.91        18
     Present       0.92      0.92      0.92        13
    accuracy          -         -      0.94        50
   macro-avg       0.94      0.94      0.94        50
weighted-avg       0.94      0.94      0.94        50