Few-Shot Assertion Model ( Oncology )

Description

Assign assertion status to clinical entities extracted by NER based on their context in the text. Also, this model is trained on a list of clinical and biomedical datasets curated in-house

Predicted Entities

Absent, Past, Present, Family, Hypothetical, Possible

Copy S3 URI

How to use

document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")\
    .setSplitChars(["-", "\/"])

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

# ner_oncology
ner_oncology = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_oncology")

ner_oncology_converter = NerConverterInternal()\
    .setInputCols(["sentence","token","ner_oncology"])\
    .setOutputCol("ner_chunk")

few_shot_assertion_converter = FewShotAssertionSentenceConverter()\
    .setInputCols(["sentence", "token", "ner_chunk"])\
    .setOutputCol("assertion_sentence")

e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_oncology", "en", "clinical/models")\
    .setInputCols(["assertion_sentence"])\
    .setOutputCol("assertion_embedding")

few_shot_assertion_classifier = FewShotAssertionClassifierModel()\
    .pretrained("fewhot_assertion_oncology_e5_base_v2_oncology", "en", "clinical/models")\
    .setInputCols(["assertion_embedding"])\
    .setOutputCol("assertion")
 
assertion_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_oncology,
    ner_oncology_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier
])

sample_text= [
"""The patient is suspected to have colorectal cancer. Her family history is positive for other cancers. The result of the biopsy was positive. A CT scan was ordered to rule out metastases."""
]

data = spark.createDataFrame([sample_text]).toDF("text")

result = assertion_pipeline.fit(data).transform(data)

val document_assembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(Array("document"))
    .setOutputCol("sentence")

val tokenizer = new Tokenizer()
    .setInputCols(Array("sentence"))
    .setOutputCol("token")

val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(Array("sentence", "token"))
    .setOutputCol("embeddings")

# ner_oncology
val ner_oncology = MedicalNerModel.pretrained("ner_oncology", "en", "clinical/models")
    .setInputCols(Array("sentence", "token", "embeddings"))
    .setOutputCol("ner_oncology")

val ner_oncology_converter = new NerConverterInternal()
    .setInputCols(Array("sentence","token","ner_oncology"))
    .setOutputCol("ner_chunk")

val few_shot_assertion_converter = new FewShotAssertionSentenceConverter()
    .setInputCols(Array("sentence", "token", "ner_chunk"))
    .setOutputCol("assertion_sentence")

val e5_embeddings = E5Embeddings.pretrained("e5_base_v2_embeddings_medical_assertion_oncology", "en", "clinical/models")
    .setInputCols(Array("assertion_sentence"))
    .setOutputCol("assertion_embedding")

val few_shot_assertion_classifier = FewShotAssertionClassifierModel()
    .pretrained("fewhot_assertion_oncology_e5_base_v2_oncology", "en", "clinical/models")
    .setInputCols(Array("assertion_embedding"))
    .setOutputCol("assertion")

val pipeline = new Pipeline().setStages(Array(    
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_oncology,
    ner_oncology_converter,
    few_shot_assertion_converter,
    e5_embeddings,
    few_shot_assertion_classifier))

val data = Seq("The patient is suspected to have colorectal cancer. Her family history is positive for other cancers. The result of the biopsy was positive. A CT scan was ordered to rule out metastases.").toDF("text")

val result = pipeline.fit(data).transform(data)

Results

| chunks            |   begin |   end | entities         | assertion   |   confidence |
|:------------------|--------:|------:|:-----------------|:------------|-------------:|
| colorectal cancer |      33 |    49 | Cancer_Dx        | Possible    |     0.581282 |
| Her               |      52 |    54 | Gender           | Present     |     0.9563   |
| cancers           |      93 |    99 | Cancer_Dx        | Family      |     0.234656 |
| biopsy            |     120 |   125 | Pathology_Test   | Past        |     0.957321 |
| CT scan           |     143 |   149 | Imaging_Test     | Past        |     0.95717  |
| metastases        |     175 |   184 | Metastasis       | Possible    |     0.549866 |

Model Information

Model Name: fewhot_assertion_oncology_e5_base_v2_oncology
Compatibility: Healthcare NLP 5.3.3+
License: Licensed
Edition: Official
Input Labels: [assertion_embedding]
Output Labels: [assertion]
Language: en
Size: 25.4 KB

Benchmarking

       label  precision    recall  f1-score   support
      Absent       0.86      0.86      0.86       343
      Family       1.00      0.86      0.92        35
Hypothetical       0.72      0.71      0.71       216
        Past       0.90      0.95      0.93      2024
    Possible       0.76      0.79      0.77        80
     Present       0.93      0.87      0.90      1842
    accuracy        -         -        0.90      4540
   macro-avg       0.86      0.84      0.85      4540
weighted-avg       0.90      0.90      0.90      4540