Sentence Entity Resolver for ATC (sbiobert_base_cased_mli embeddings)

Description

This model maps drug entities to ATC (Anatomical Therapeutic Chemical) codes using sbiobert_base_cased_mli Sentence BERT embeddings.

Predicted Entities

ATC Codes

Open in Colab Copy S3 URI

How to use

# Wrap the raw "text" column into a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences with the pretrained healthcare detector.
sentenceDetectorDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize every sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings consumed by the NER stage below.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("word_embeddings")
)

# Posology NER tags tokens; its output feeds the chunk converter.
posology_ner = (
    MedicalNerModel.pretrained("ner_posology", "en", "clinical/models")
    .setInputCols(["sentence", "token", "word_embeddings"])
    .setOutputCol("ner")
)

# Merge NER tags into chunks, with the whitelist restricted to the DRUG label.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["DRUG"])
)

# Re-wrap each chunk as a document so it can be embedded on its own.
c2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Sentence-level BERT embeddings computed over the chunk documents.
sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sentence_embeddings")
    .setCaseSensitive(False)
)

# Resolve each embedded chunk to an ATC code using Euclidean distance.
atc_resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_atc", "en", "clinical/models")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("atc_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Stage order matters: each stage reads columns produced by earlier ones.
resolver_pipeline = Pipeline(stages=[
    document_assembler,
    sentenceDetectorDL,
    tokenizer,
    word_embeddings,
    posology_ner,
    ner_converter,
    c2doc,
    sbert_embedder,
    atc_resolver,
])

# Three sample clinical notes, one row each.
sampleText = [
    """He was seen by the endocrinology service and she was discharged on eltrombopag at night, amlodipine with meals metformin two times a day.""",
    """She was immediately given hydrogen peroxide 30 mg and amoxicillin twice daily for 10 days to treat the infection on her leg. She has a history of taking magnesium hydroxide.""",
    """She was given antidepressant for a month""",
]

data = spark.createDataFrame(sampleText, StringType()).toDF("text")

# Fit the pipeline on the sample data, then annotate that same data.
resolver_model = resolver_pipeline.fit(data)
results = resolver_model.transform(data)
// Wrap the raw "text" column into a "document" annotation column.
val document_assembler = DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

// Split each document into sentences with the pretrained healthcare detector.
val sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

// Tokenize every sentence.
val tokenizer = Tokenizer()
      .setInputCols(Array("sentence"))
      .setOutputCol("token")

// Clinical word embeddings consumed by the NER stage below.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
      .setInputCols(Array("sentence", "token"))
      .setOutputCol("word_embeddings")

// Posology NER tags tokens; its output feeds the chunk converter.
val posology_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models")
      .setInputCols(Array("sentence", "token", "word_embeddings"))
      .setOutputCol("ner")

// Merge NER tags into chunks, with the whitelist restricted to the DRUG label.
val ner_converter = NerConverterInternal()
      .setInputCols(Array("sentence", "token", "ner"))
      .setOutputCol("ner_chunk")
      .setWhiteList(Array("DRUG"))

// Re-wrap each chunk as a document so it can be embedded on its own.
val c2doc = Chunk2Doc()
      .setInputCols(Array("ner_chunk"))
      .setOutputCol("ner_chunk_doc")

// Sentence-level BERT embeddings computed over the chunk documents.
// Scala boolean literals are lowercase: `false`, not Python's `False`.
val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
      .setInputCols(Array("ner_chunk_doc"))
      .setOutputCol("sentence_embeddings")
      .setCaseSensitive(false)

// Resolve each embedded chunk to an ATC code using Euclidean distance.
val atc_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_atc", "en", "clinical/models")
      .setInputCols(Array("sentence_embeddings"))
      .setOutputCol("atc_code")
      .setDistanceFunction("EUCLIDEAN")

// `Pipeline` is the estimator that exposes `setStages` and `fit`; `PipelineModel`
// is the fitted result returned by `fit` and cannot be built with `new`.
import org.apache.spark.ml.Pipeline

// Stage order matters: each stage reads columns produced by earlier ones.
val resolver_pipeline = new Pipeline().setStages(Array(
      document_assembler,
      sentenceDetectorDL,
      tokenizer,
      word_embeddings,
      posology_ner,
      ner_converter,
      c2doc,
      sbert_embedder,
      atc_resolver))

// Single-row sample input. `toDF` on a Seq needs `spark.implicits._` in scope
// (it is pre-imported in spark-shell).
val data = Seq("He was seen by the endocrinology service and she was discharged on eltrombopag at night, amlodipine with meals metformin two times a day and then ibuprofen. She was immediately given hydrogen peroxide 30 mg and amoxicillin twice daily for 10 days to treat the infection on her leg. She has a history of taking magnesium hydroxide. She was given antidepressant for a month").toDF("text")

// Fit the pipeline on the sample data, then annotate that same data.
val results = resolver_pipeline.fit(data).transform(data)

import nlu

# Load the ATC resolver through its NLU reference, then run it on one sample note.
atc_pipeline = nlu.load("en.resolve.atc")
atc_pipeline.predict("""She was immediately given hydrogen peroxide 30 mg and amoxicillin twice daily for 10 days to treat the infection on her leg. She has a history of taking magnesium hydroxide.""")

Results

+-------------------+--------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|              chunk|atc_code|                                       all_k_codes|                                       resolutions|                                  all_k_aux_labels|
+-------------------+--------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|        eltrombopag| B02BX05|B02BX05:::A07DA06:::B06AC03:::M01AB08:::L04AA39...|eltrombopag :::eluxadoline :::ecallantide :::et...|ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th...|
|         amlodipine| C08CA01|C08CA01:::C08CA17:::C08CA13:::C08CA06:::C08CA10...|amlodipine :::levamlodipine :::lercanidipine ::...|ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th...|
|          metformin| A10BA02|A10BA02:::A10BA01:::A10BB01:::A10BH04:::A10BB07...|metformin :::phenformin :::glyburide / metformi...|ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th...|
|  hydrogen peroxide| A01AB02|A01AB02:::S02AA06:::D10AE:::D11AX25:::D10AE01::...|hydrogen peroxide :::hydrogen peroxide; otic:::...|ATC 5th:::ATC 5th:::ATC 4th:::ATC 5th:::ATC 5th...|
|        amoxicillin| J01CA04|J01CA04:::J01CA01:::J01CF02:::J01CF01:::J01CA51...|amoxicillin :::ampicillin :::cloxacillin :::dic...|ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th...|
|magnesium hydroxide| A02AA04|A02AA04:::A12CC02:::D10AX30:::B05XA11:::A02AA02...|magnesium hydroxide :::magnesium sulfate :::alu...|ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th:::ATC 5th...|
|     antidepressant|    N06A|N06A:::N05A:::N06AX:::N05AH02:::N06D:::N06CA:::...|ANTIDEPRESSANTS:::ANTIPSYCHOTICS:::Other antide...|ATC 3rd:::ATC 3rd:::ATC 4th:::ATC 5th:::ATC 3rd...|
+-------------------+--------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+

Model Information

Model Name: sbiobertresolve_atc
Compatibility: Healthcare NLP 3.4.1+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [atc_code]
Language: en
Size: 71.6 MB
Case sensitive: false

References

Trained on the ATC 2022 codes dataset.