Sentence Entity Resolver for NDC (sbiobert_base_cased_mli embeddings)

Description

This model maps clinical entities and concepts (such as drugs and ingredients) to National Drug Codes (NDC) using sbiobert_base_cased_mli Sentence BERT embeddings. It also returns package options and alternative drugs in the all_k_aux_label column.

Predicted Entities

NDC codes, package options

Live Demo | Open in Colab | Copy S3 URI

How to use

# Python example: resolve DRUG chunks to NDC codes with sbiobert_base_cased_mli embeddings.
# NOTE: a backslash line continuation must be the very last character on its line,
# and the final call of each chain must not end with one.

# Wrap the raw "text" column as a "document" annotation.
document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split documents into sentences with the pretrained healthcare sentence detector.
sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

# Produce tokens from each sentence.
tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Clinical word embeddings consumed by the NER stage below.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("word_embeddings")

# Posology NER over sentences, tokens and word embeddings.
ner = MedicalNerModel.pretrained("ner_posology_greedy", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "word_embeddings"])\
    .setOutputCol("ner")

# Convert NER output into chunks, keeping only the DRUG label.
ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")\
    .setWhiteList(["DRUG"])

# Re-wrap each chunk as a document so the sentence embedder can take it as input.
c2doc = Chunk2Doc()\
    .setInputCols("ner_chunk")\
    .setOutputCol("ner_chunk_doc")

# Sentence-level BERT embeddings for each chunk document (case-insensitive).
sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")\
    .setInputCols(["ner_chunk_doc"])\
    .setOutputCol("sentence_embeddings")\
    .setCaseSensitive(False)

# Resolve each chunk embedding to an NDC code using Euclidean distance.
ndc_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_ndc", "en", "clinical/models")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("ndc_code")\
    .setDistanceFunction("EUCLIDEAN")

# Stage order matters: each stage reads the output columns of earlier stages.
resolver_pipeline = Pipeline(stages=[
    document_assembler,
    sentenceDetectorDL,
    tokenizer,
    word_embeddings,
    ner,
    ner_converter,
    c2doc,
    sbert_embedder,
    ndc_resolver,
])

# `spark` is assumed to be an already-started Spark session.
text = "The patient was prescribed Amlodopine Vallarta 10-320mg, Eviplera. aspirin 81 mg and metformin 500 mg. The other patient is given Lescol 40 MG and Everolimus 1.5 mg tablet."
data = spark.createDataFrame([[text]]).toDF("text")

# Fit on the sample frame, then annotate that same frame.
result = resolver_pipeline.fit(data).transform(data)
// Scala example: resolve DRUG chunks to NDC codes with sbiobert_base_cased_mli embeddings.

// Wrap the raw "text" column as a "document" annotation.
val document_assembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

// Split documents into sentences with the pretrained healthcare sentence detector.
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols("document")
    .setOutputCol("sentence")

// Produce tokens from each sentence.
val tokenizer = new Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")

// Clinical word embeddings consumed by the NER stage below.
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(Array("sentence","token"))
    .setOutputCol("embeddings")

// Posology NER over sentences, tokens and word embeddings.
val ner = MedicalNerModel.pretrained("ner_posology_greedy", "en", "clinical/models")
    .setInputCols(Array("sentence","token","embeddings"))
    .setOutputCol("ner")

// Convert NER output into chunks, keeping only the DRUG label.
// NOTE(review): the Python example uses NerConverterInternal here; confirm which converter is intended.
val ner_converter = new NerConverter()
    .setInputCols(Array("sentence","token","ner"))
    .setOutputCol("ner_chunk")
    .setWhiteList("DRUG")

// Re-wrap each chunk as a document so the sentence embedder can take it as input.
val chunk2doc = new Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")

// Sentence-level BERT embeddings for each chunk document.
// Case sensitivity is disabled to match the Python example and the model's "Case sensitive: false" setting.
val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
    .setInputCols("ner_chunk_doc")
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(false)

// Resolve each chunk embedding to an NDC code using Euclidean distance.
val ndc_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_ndc", "en", "clinical/models")
    .setInputCols("sbert_embeddings")
    .setOutputCol("ndc_code")
    .setDistanceFunction("EUCLIDEAN")

// Stage order matters: each stage reads the output columns of earlier stages.
val pipeline = new Pipeline().setStages(Array(
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner,
    ner_converter,
    chunk2doc,
    sbert_embedder,
    ndc_resolver))

// `toDS()` on a Seq needs the Spark session implicits (`import spark.implicits._`) in scope.
val data = Seq("The patient was prescribed Amlodopine Vallarta 10-320mg, Eviplera. aspirin 81 mg and metformin 500 mg. The other patient is given Lescol 40 MG and Everolimus 1.5 mg tablet.").toDS().toDF("text")

// Fit on the sample frame, then annotate that same frame.
val result = pipeline.fit(data).transform(data)

Results

|    | ner_chunk                    | entity   | ndc_code   | aux_list                                                                                            |
|---:|:-----------------------------|:---------|:-----------|:----------------------------------------------------------------------------------------------------|
|  0 | Amlodopine Vallarta 10-320mg | DRUG     | 72483-0100 | '{'packages': "['1 BOTTLE in 1 BOX (72483-100-04)  / 120 mL in 1 BOTTLE\']", \'alternatives\': [ ...|
|  1 | aspirin 81 mg                | DRUG     | 41250-0780 | '{'packages': "['1 BOTTLE, PLASTIC in 1 PACKAGE (41250-780-01)  / 120 TABLET, DELAYED RELEASE in ...|
|  2 | metformin 500 mg             | DRUG     | 62207-0491 | '{'packages': "['5000 TABLET in 1 POUCH (62207-491-31)\', \'25000 TABLET in 1 CARTON (62207-491- ...|
|  3 | Lescol 40 MG                 | DRUG     | 0713-0862  | '{'packages': "['30 TABLET, FILM COATED in 1 BOTTLE, PLASTIC (0713-0862-30)\']", \'alternatives\ ...|
|  4 | Everolimus 1.5 mg tablet     | DRUG     | 0054-0604  | '{'packages': "['60 TABLET in 1 BOTTLE (0054-0604-21)\']", \'alternatives\': [\'67877-721\', \'4 ...|

Model Information

Model Name: sbiobertresolve_ndc
Compatibility: Healthcare NLP 5.4.1+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [ndc_code]
Language: en
Size: 724.6 MB
Case sensitive: false

References

This model was trained on the U.S. FDA 2024 NDC Codes dataset.