Zero-shot Relation Extraction (BioBert)

Description

Zero-shot Relation Extraction to extract relations between clinical entities with no training dataset, just pretrained BioBert embeddings (included in the model). This model requires Healthcare NLP 3.5.0.

Take a look at how it works in the “Open in Colab” section below.

Predicted Entities

Open in Colab Copy S3 URI

How to use

documenter = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("tokens")

sentencer = nlp.SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentences")

words_embedder = nlp.WordEmbeddingsModel() \
    .pretrained("embeddings_clinical", "en", "clinical/models") \
    .setInputCols(["sentences", "tokens"]) \
    .setOutputCol("embeddings")

ner_clinical = medical.NerModel() \
    .pretrained("ner_clinical", "en", "clinical/models") \
    .setInputCols(["sentences", "tokens", "embeddings"]) \
    .setOutputCol("ner_clinical")

ner_clinical_converter = nlp.NerConverter() \
    .setInputCols(["sentences", "tokens", "ner_clinical"]) \
    .setOutputCol("ner_clinical_chunks")\
    .setWhiteList(["PROBLEM", "TEST"])      # PROBLEM-TEST-TREATMENT

ner_posology = medical.NerModel.pretrained("ner_posology", "en", "clinical/models") \
    .setInputCols(["sentences", "tokens", "embeddings"]) \
    .setOutputCol("ner_posology")           

ner_posology_converter = nlp.NerConverter() \
    .setInputCols(["sentences", "tokens", "ner_posology"]) \
    .setOutputCol("ner_posology_chunks")\
    .setWhiteList(["DRUG"])                # DRUG-FREQUENCY-DOSAGE-DURATION-FORM-ROUTE-STRENGTH

chunk_merger = medical.ChunkMergeApproach()\
    .setInputCols("ner_clinical_chunks", "ner_posology_chunks")\
    .setOutputCol('merged_ner_chunks')

## ZERO-SHOT RE Starting...

re_model = medical.ZeroShotRelationExtractionModel.pretrained("re_zeroshot_biobert", "en", "clinical/models")\
    .setInputCols(["merged_ner_chunks", "sentences"]) \
    .setOutputCol("relations")\
    .setMultiLabel(True)

re_model.setRelationalCategories({
    "ADE": ["{DRUG} causes {PROBLEM}."],
    "IMPROVE": ["{DRUG} improves {PROBLEM}.", "{DRUG} cures {PROBLEM}."],
    "REVEAL": ["{TEST} reveals {PROBLEM}."]})

pipeline = nlp.Pipeline() \
    .setStages([documenter, 
                tokenizer, 
                sentencer, 
                words_embedder, 
                ner_clinical, 
                ner_clinical_converter,
                ner_posology,
                ner_posology_converter,
                chunk_merger,
                re_model])

data = spark.createDataFrame( [["Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer."]] ).toDF("text")

results = pipeline.fit(data).transform(data)

results\
.selectExpr("explode(relations) as relation")\
.show(truncate=False)
val data = spark.createDataFrame(Seq("Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer.")).toDF("text")

val documenter = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

val tokenizer = new Tokenizer()
    .setInputCols("document")
    .setOutputCol("tokens")

val sentencer = new SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentences")

val words_embedder = WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(Array("sentences", "tokens"))
    .setOutputCol("embeddings")

val ner_clinical = MedicalNerModel()
    .pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(Array("sentences", "tokens", "embeddings"))
    .setOutputCol("ner_clinical")

val ner_clinical_converter = new NerConverter()
    .setInputCols(Array("sentences", "tokens", "ner_clinical"))
    .setOutputCol("ner_clinical_chunks")
    .setWhiteList(Array("PROBLEM", "TEST"))      # PROBLEM-TEST-TREATMENT

val ner_posology = MedicalNerModel()
    .pretrained("ner_posology", "en", "clinical/models")
    .setInputCols(Array("sentences", "tokens", "embeddings"))
    .setOutputCol("ner_posology")           

val ner_posology_converter = new NerConverter()
    .setInputCols(Array("sentences", "tokens", "ner_posology"))
    .setOutputCol("ner_posology_chunks")
    .setWhiteList(Array("DRUG"))                # DRUG-FREQUENCY-DOSAGE-DURATION-FORM-ROUTE-STRENGTH

val chunk_merger = ChunkMergeApproach()
    .setInputCols("ner_clinical_chunks", "ner_posology_chunks")
    .setOutputCol('merged_ner_chunks')

## ZERO-SHOT RE Starting...

val re_model = ZeroShotRelationExtractionModel.pretrained("re_zeroshot_biobert", "en", "clinical/models")
    .setInputCols(Array("ner_chunks", "sentences"))
    .setOutputCol("relations")
    .setMultiLabel(true)

re_model.setRelationalCategories({ Map(
    "CURE" -> Array("{TREATMENT} cures {PROBLEM}."),
    "IMPROVE" -> Array("{TREATMENT} improves {PROBLEM}.", "{TREATMENT} cures {PROBLEM}."),
    "REVEAL" -> Array("{TEST} reveals {PROBLEM}.") ))

val pipeline = new Pipeline()
    .setStages(Array(documenter, 
                     tokenizer, 
                     sentencer, 
                     words_embedder, 
                     ner_clinical, 
                     ner_clinical_converter,
                     ner_posology,
                     ner_posology_converter,
                     chunk_merger,
                     re_model))

val model = pipeline.fit(data)
val results = model.transform(data)
import nlu
nlu.load("en.relation.zeroshot_biobert").predict("""Paracetamol can alleviate headache or sickness. An MRI test can be used to find cancer.""")

Results

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|relation                                                                                                                                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{category, 268, 358, IMPROVE, {entity1_begin -> 0, relation -> IMPROVE, hypothesis -> Paracetamol improves sickness., confidence -> 0.98819494, nli_prediction -> entail, entity1 -> DRUG, syntactic_distance -> undefined, chunk2 -> sickness, entity2_end -> 45, entity1_end -> 10, entity2_begin -> 38, entity2 -> PROBLEM, chunk1 -> Paracetamol, sentence -> 0}, []}|
|{category, 0, 90, IMPROVE, {entity1_begin -> 0, relation -> IMPROVE, hypothesis -> Paracetamol improves headache., confidence -> 0.9929625, nli_prediction -> entail, entity1 -> DRUG, syntactic_distance -> undefined, chunk2 -> headache, entity2_end -> 33, entity1_end -> 10, entity2_begin -> 26, entity2 -> PROBLEM, chunk1 -> Paracetamol, sentence -> 0}, []}    |
|{category, 536, 615, REVEAL, {entity1_begin -> 48, relation -> REVEAL, hypothesis -> An MRI test reveals cancer., confidence -> 0.9760039, nli_prediction -> entail, entity1 -> TEST, syntactic_distance -> undefined, chunk2 -> cancer, entity2_end -> 85, entity1_end -> 58, entity2_begin -> 80, entity2 -> PROBLEM, chunk1 -> An MRI test, sentence -> 1}, []}       |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Model Information

Model Name: re_zeroshot_biobert
Compatibility: Healthcare NLP 3.5.0+
License: Licensed
Edition: Official
Language: en
Size: 1.3 GB
Case sensitive: true

References

As it is a zero-shot relation extractor, no training dataset is necessary.