Sentence Entity Resolver for ICD10-CM (Augmented)

Description

This model maps extracted medical entities to ICD-10-CM codes using sbiobert_base_cased_mli Sentence Bert Embeddings. Also, it has been augmented with synonyms to make it more accurate.

Predicted Entities

ICD-10-CM Codes

Download Copy S3 URI

How to use

document_assembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentenceDetectorDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("word_embeddings")

ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token", "word_embeddings"])\
    .setOutputCol("ner")\

ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")\
    .setWhiteList(["PROBLEM"])

c2doc = Chunk2Doc()\
    .setInputCols("ner_chunk")\
    .setOutputCol("ner_chunk_doc") 

sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")\
    .setInputCols(["ner_chunk_doc"])\
    .setOutputCol("sentence_embeddings")\
    .setCaseSensitive(False)

icd_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models") \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("resolution")\
    .setDistanceFunction("EUCLIDEAN")

resolver_pipeline = Pipeline(stages = [document_assembler,
                                       sentenceDetectorDL,
                                       tokenizer,
                                       word_embeddings,
                                       ner,
                                       ner_converter,
                                       c2doc,
                                       sbert_embedder,
                                       icd_resolver])

data = spark.createDataFrame([["""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus, associated with obesity with a body mass index (BMI) of 33.5 kg/m2, presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. Two weeks prior to presentation, she was treated with a five-day course of amoxicillin for a respiratory tract infection."""]]).toDF("text")

result = resolver_pipeline.fit(data).transform(data)

val document_assembler = new DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")

val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
    .setInputCols("document")
    .setOutputCol("sentence")

val tokenizer = new Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")

val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(Array("sentence","token"))
    .setOutputCol("embeddings")

val clinical_ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(Array("sentence","token","embeddings"))
    .setOutputCol("ner")

val ner_converter = new NerConverter()
    .setInputCols(Array("sentence","token","ner"))
    .setOutputCol("ner_chunk")
    .setWhiteList("PROBLEM")

val chunk2doc = new Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")

val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli","en","clinical/models")
    .setInputCols("ner_chunk_doc")
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)

val icd10_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models")
    .setInputCols("sbert_embeddings") 
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")

val pipeline = new Pipeline().setStages(Array(document_assembler, 
                               sentence_detector, 
                               tokenizer, 
                               word_embeddings, 
                               clinical_ner, 
                               ner_converter, 
                               chunk2doc, 
                               sbert_embedder, 
                               icd10_resolver))

val data = Seq("A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus, associated with obesity with a body mass index (BMI) of 33.5 kg/m2, presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. Two weeks prior to presentation, she was treated with a five-day course of amoxicillin for a respiratory tract infection.").toDS().toDF("text")

val result = pipeline.fit(data).transform(data)

Results

+-------------------------------------+-------+----------+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|                            ner_chunk| entity|icd10_code|                                                                resolutions|                                                                  all_codes|
+-------------------------------------+-------+----------+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|        gestational diabetes mellitus|PROBLEM|     O24.4|[gestational diabetes mellitus [gestational diabetes mellitus], gestatio...|      [O24.4, O24.41, O24.43, Z86.32, Z87.5, O24.31, O24.11, O24.1, O24.81]|
|subsequent type two diabetes mellitus|PROBLEM|    O24.11|[pre-existing type 2 diabetes mellitus [pre-existing type 2 diabetes mel...|[O24.11, E11.8, E11, E13.9, E11.9, E11.3, E11.44, Z86.3, Z86.39, E11.32,...|
|                              obesity|PROBLEM|     E66.9|[obesity [obesity, unspecified], abdominal obesity [other obesity], obes...|[E66.9, E66.8, Z68.41, Q13.0, E66, E66.01, Z86.39, E34.9, H35.50, Z83.49...|
|                    a body mass index|PROBLEM|    Z68.41|[finding of body mass index [body mass index [bmi] 40.0-44.9, adult], ob...|[Z68.41, E66.9, R22.9, Z68.1, R22.3, R22.1, Z68, R22.2, R22.0, R41.89, M...|
|                             polyuria|PROBLEM|       R35|[polyuria [polyuria], nocturnal polyuria [nocturnal polyuria], polyuric ...|[R35, R35.81, R35.8, E23.2, R31, R35.0, R82.99, N40.1, E72.3, O04.8, R30...|
|                           polydipsia|PROBLEM|     R63.1|[polydipsia [polydipsia], psychogenic polydipsia [other impulse disorder...|[R63.1, F63.89, E23.2, F63.9, O40, G47.5, M79.89, R63.2, R06.1, H53.8, I...|
|                        poor appetite|PROBLEM|     R63.0|[poor appetite [anorexia], poor feeding [feeding problem of newborn, uns...|[R63.0, P92.9, R43.8, R43.2, E86, R19.6, F52.0, Z72.4, R06.89, Z76.89, R...|
|                             vomiting|PROBLEM|     R11.1|[vomiting [vomiting], intermittent vomiting [nausea and vomiting], vomit...|          [R11.1, R11, R11.10, G43.A1, P92.1, P92.09, G43.A, R11.13, R11.0]|
|        a respiratory tract infection|PROBLEM|     J98.8|[respiratory tract infection [other specified respiratory disorders], up...|[J98.8, J06.9, A49.9, J22, J20.9, Z59.3, T17, J04.10, Z13.83, J18.9, P28...|
+-------------------------------------+-------+----------+---------------------------------------------------------------------------+---------------------------------------------------------------------------+

Model Information

Model Name:	sbiobertresolve_icd10cm_augmented
Compatibility:	Healthcare NLP 5.4.0+
License:	Licensed
Edition:	Official
Input Labels:	[sentence_embeddings]
Output Labels:	[icd10cm_code]
Language:	en
Size:	1.3 GB
Case sensitive:	false

References

This model is trained with the 2025 version of ICD-10-CM dataset.

PREVIOUSSentence Entity Resolver for Billable ICD-10-CM HCC Codes

NEXTICD10CM Sentence Entity Resolver (Slim, normalized)