Few-Shot Assertion Model ( SDOH )

Description

Assign assertion status to clinical entities extracted by NER based on their context in the text. Also this model is trained on a list of clinical and biomedical datasets curated in-house

Predicted Entities

Absent, Past, Present, Someone_Else, Hypothetical, Possible

Copy S3 URI

How to use

document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")\
    .setSplitChars(["-", "\/"])

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

# ner_sdoh
clinical_ner = MedicalNerModel.pretrained("ner_sdoh","en","clinical/models")\
    .setInputCols(["sentence","token","embeddings"])\
    .setOutputCol("ner")

ner_converter = NerConverterInternal()\
    .setInputCols(["sentence","token","ner"])\
    .setOutputCol("ner_sdoh_chunk")\
    .setBlackList(['Age','Gender','Language','HEALTHCARE_INSTITUTION']) 

few_shot_assertion_converter = FewShotAssertionSentenceConverter()\
    .setInputCols(["sentence","token", "ner_sdoh_chunk"])\
    .setOutputCol("assertion_sentence")

e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_sdoh", "en", "clinical/models")\
    .setInputCols(["assertion_sentence"])\
    .setOutputCol("assertion_embedding")

few_shot_assertion_classifier = FewShotAssertionClassifierModel()\
    .pretrained("fewhot_assertion_sdoh_e5_base_v2_sdoh", "en", "clinical/models")\
    .setInputCols(["assertion_embedding"])\
    .setOutputCol("assertion")

assertion_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier
])

data = spark.createDataFrame([["""Smith works as a cleaning assistant and does not have access to health insurance or paid sick leave.
But she has generally housing problems. She lives in a apartment now.  She has long history of EtOH abuse, beginning in her teens.
She is aware she needs to attend Rehab Programs. She had DUI back in April and was due to be in court this week.
Her partner is an alcoholic and a drug abuser for the last 5 years.
She also mentioned feeling socially isolated and lack of a strong support system."""]]).toDF("text")

result = assertion_pipeline.fit(data).transform(data)

val document_assembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols(Array("document"))
    .setOutputCol("sentence")

val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")
    .setSplitChars(Array("-", "\/"))

val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
    .setInputCols(Array("sentence","token"))
    .setOutputCol("embeddings")

// ner_sdoh
val clinical_ner = MedicalNerModel.pretrained("ner_sdoh","en","clinical/models")
    .setInputCols(Array("sentence","token","embeddings"))
    .setOutputCol("ner")

val ner_converter = new NerConverterInternal()
    .setInputCols(Array("sentence","token","ner"))
    .setOutputCol("ner_sdoh_chunk")
    .setBlackList(Array('Age','Gender','Language','HEALTHCARE_INSTITUTION')) 

val few_shot_assertion_converter = new FewShotAssertionSentenceConverter()
    .setInputCols(Array("sentence","token", "ner_sdoh_chunk"))
    .setOutputCol("assertion_sentence")

val e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_sdoh", "en", "clinical/models")
    .setInputCols(Array("assertion_sentence"))
    .setOutputCol("assertion_embedding")

val few_shot_assertion_classifier = FewShotAssertionClassifierModel()
    .pretrained("fewhot_assertion_sdoh_e5_base_v2_sdoh", "en", "clinical/models")
    .setInputCols(Array("assertion_embedding"))
    .setOutputCol("assertion")

val pipeline = new Pipeline().setStages(Array(
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier))

val data = Seq(Array("""Smith works as a cleaning assistant and does not have access to health insurance or paid sick leave.
But she has generally housing problems. She lives in a apartment now.  She has long history of EtOH abuse, beginning in her teens.
She is aware she needs to attend Rehab Programs. She had DUI back in April and was due to be in court this week.
Her partner is an alcoholic and a drug abuser for the last 5 years.
She also mentioned feeling socially isolated and lack of a strong support system.""")).toDF("text")
val result = pipeline.fit(data).transform(data)

Results

|    | chunks             |   begin |   end | entities           | assertion    |   confidence |
|---:|:-------------------|--------:|------:|:-------------------|:-------------|-------------:|
|  0 | cleaning assistant |      17 |    34 | Employment         | present      |     0.956549 |
|  1 | health insurance   |      64 |    79 | Insurance_Status   | Absent       |     0.930705 |
|  2 | apartment          |     156 |   164 | Housing            | present      |     0.953653 |
|  3 | EtOH abuse         |     196 |   205 | Alcohol            | Past         |     0.855614 |
|  4 | Rehab Programs     |     265 |   278 | Access_To_Care     | Hypothetical |     0.871034 |
|  5 | DUI                |     289 |   291 | Legal_Issues       | Past         |     0.853602 |
|  6 | alcoholic          |     363 |   371 | Alcohol            | Someone_Else |     0.895126 |
|  7 | drug abuser        |     379 |   389 | Substance_Use      | Someone_Else |     0.89584  |
|  8 | last 5 years       |     399 |   410 | Substance_Duration | present      |     0.95608  |
|  9 | socially isolated  |     440 |   456 | Social_Exclusion   | present      |     0.956841 |
| 10 | strong support     |     472 |   485 | Social_Support     | Absent       |     0.93079  |

Model Information

Model Name: fewhot_assertion_sdoh_e5_base_v2_sdoh
Compatibility: Healthcare NLP 5.3.3+
License: Licensed
Edition: Official
Input Labels: [assertion_embedding]
Output Labels: [assertion]
Language: en
Size: 25.4 KB

Benchmarking

       label  precision    recall  f1-score   support
      Absent       0.93      0.95      0.94       385
Hypothetical       0.83      0.82      0.83       211
        Past       0.71      0.71      0.71       156
    Possible       0.75      0.72      0.74        64
Someone_Else       0.89      0.75      0.81       240
     present       0.85      0.89      0.87       670
    accuracy          -         -      0.85      1726
   macro-avg       0.83      0.81      0.81      1726
weighted-avg       0.85      0.85      0.85      1726