Sentence Entity Resolver for SNOMED (sbiobertresolve_snomed_drug)

Description

This model maps detected drug entities to SNOMED codes using `sbiobert_base_cased_mli` Sentence BERT embeddings.

Predicted Entities

Copy S3 URI

How to use

# Python: pipeline that resolves DRUG chunks to SNOMED codes.
# Each builder chain is wrapped in parentheses instead of being continued with
# trailing backslashes, so no statement ends on a dangling continuation.

# Reads the "text" column and writes the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained "sentence_detector_dl_healthcare": "document" -> "sentence".
sentenceDetectorDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained "embeddings_clinical": ("sentence", "token") -> "word_embeddings".
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("word_embeddings")
)

# Pretrained "ner_posology": ("sentence", "token", "word_embeddings") -> "ner".
clinical_ner = (
    MedicalNerModel.pretrained("ner_posology", "en", "clinical/models")
    .setInputCols(["sentence", "token", "word_embeddings"])
    .setOutputCol("ner")
)

# ("sentence", "token", "ner") -> "ner_chunk", with the white list set to DRUG.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["DRUG"])
)

# "ner_chunk" -> "ner_chunk_doc", the input column of the sentence embeddings stage.
c2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

# Pretrained "sbiobert_base_cased_mli": "ner_chunk_doc" -> "sentence_embeddings".
sentence_chunk_embeddings = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sentence_embeddings")
)

# Pretrained "sbiobertresolve_snomed_drug": "sentence_embeddings" -> "snomed_code",
# with the distance function set to "EUCLIDEAN".
snomed_resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_drug", "en", "clinical/models")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Stages are listed so that each one's input columns are produced by an earlier stage.
resolver_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentenceDetectorDL,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        c2doc,
        sentence_chunk_embeddings,
        snomed_resolver,
    ]
)

# Fit the pipeline on a one-row DataFrame whose "text" column holds an empty string.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = resolver_pipeline.fit(empty_df)

# Apply the fitted pipeline to a single sample sentence in the "text" column.
sample_text = "John's doctor prescribed aspirin for his heart condition, along with paracetamol for his fever and headache, amoxicillin for his tonsilitis and lansoprazole for his GORD on 2023-12-01."
sample_df = spark.createDataFrame([[sample_text]]).toDF("text")
result = model.transform(sample_df)
// Scala: stage definitions for the SNOMED drug resolver pipeline.
// Annotators that are plain classes are instantiated with `new`; the
// `pretrained(...)` loaders are called on the companion objects.

// Reads the "text" column and writes the "document" column.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Pretrained "sentence_detector_dl_healthcare": "document" -> "sentence".
val sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Pretrained "embeddings_clinical": ("sentence", "token") -> "word_embeddings".
val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("word_embeddings")

// Pretrained "ner_posology": ("sentence", "token", "word_embeddings") -> "ner".
val clinical_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "word_embeddings"))
  .setOutputCol("ner")

// ("sentence", "token", "ner") -> "ner_chunk", with the white list set to DRUG.
val ner_converter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("DRUG"))

// "ner_chunk" -> "ner_chunk_doc"; Array form used for consistency with the other stages.
val c2doc = new Chunk2Doc()
  .setInputCols(Array("ner_chunk"))
  .setOutputCol("ner_chunk_doc")

// Pretrained "sbiobert_base_cased_mli": "ner_chunk_doc" -> "sentence_embeddings".
val sentence_chunk_embeddings = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sentence_embeddings")


// Pretrained "sbiobertresolve_snomed_drug": "sentence_embeddings" -> "snomed_code",
// with the distance function set to "EUCLIDEAN".
val snomed_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_snomed_drug", "en", "clinical/models")
  .setInputCols(Array("sentence_embeddings"))
  .setOutputCol("snomed_code")
  .setDistanceFunction("EUCLIDEAN")


// Assemble the stages in execution order. `setStages` takes a single Array of
// stages, so the stages are wrapped in `Array(...)` rather than passed positionally.
val resolver_pipeline = new Pipeline().setStages(Array(
    document_assembler,
    sentenceDetectorDL,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
    c2doc,
    sentence_chunk_embeddings,
    snomed_resolver))

// One-row DataFrame with the sample sentence in the "text" column.
// NOTE: `Seq(...).toDF` needs the Spark SQL implicits (`import spark.implicits._`) in scope.
val data = Seq("John's doctor prescribed aspirin for his heart condition, along with paracetamol for his fever and headache, amoxicillin for his tonsilitis and lansoprazole for his GORD on 2023-12-01.").toDF("text")

// Fit the pipeline on the sample data and apply the fitted model to the same data.
val result = resolver_pipeline.fit(data).transform(data)

Results

+------------+-----+-----------+------------+--------------------------------------------------+--------------------------------------------------+
|       chunk|label|snomed_code|  resolution|                                         all_codes|                                   all_resolutions|
+------------+-----+-----------+------------+--------------------------------------------------+--------------------------------------------------+
|     aspirin| DRUG|    7947003|     aspirin|7947003:::358427004:::426365001:::412566001:::2...|aspirin:::oral aspirin:::aspirin, buffered:::bu...|
| paracetamol| DRUG|  387517004| paracetamol|387517004:::90332006:::437876006:::437818001:::...|paracetamol:::paracetamol product:::oral form p...|
| amoxicillin| DRUG|   27658006| amoxicillin|27658006:::350162003:::427483001:::350164002:::...|amoxicillin:::oral amoxicillin:::amoxicillin so...|
|lansoprazole| DRUG|  108666007|lansoprazole|108666007:::437961004:::441863009:::716069007::...|lansoprazole:::oral form lansoprazole:::dexlans...|
+------------+-----+-----------+------------+--------------------------------------------------+--------------------------------------------------+

Model Information

Model Name: sbiobertresolve_snomed_drug
Compatibility: Healthcare NLP 5.3.0+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [snomed_code]
Language: en
Size: 283.0 MB
Case sensitive: false

References

This model was trained on an augmented version of the NIH September 2023 SNOMED CT United States (US) Edition.