Description
This model maps clinical entities to ICD-10-CM codes using bge_base_en_v1_5_onnx embeddings. It resolves entities from the following domains: Condition, Observation, Measurement, and Procedure.
Predicted Entities
ICD-10-CM Codes
How to use
document_assembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("word_embeddings")
ner_clinical = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token", "word_embeddings"])\
.setOutputCol("ner")
ner_clinical_converter = NerConverterInternal()\
.setInputCols(["sentence", "token", "ner"])\
.setOutputCol("ner_chunk")\
.setWhiteList(["PROBLEM"])
chunk2doc = Chunk2Doc()\
.setInputCols("ner_chunk")\
.setOutputCol("ner_chunk_doc")
bge_embeddings = BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")\
.setInputCols(["ner_chunk_doc"])\
.setOutputCol("bge_embeddings")
icd10cm_resolver = SentenceEntityResolverModel.pretrained("bgeresolve_icd10cm", "en", "clinical/models")\
.setInputCols(["bge_embeddings"])\
.setOutputCol("icd10cm_code")\
.setDistanceFunction("EUCLIDEAN")
pipeline = Pipeline().setStages([
document_assembler,
sentence_detector,
tokenizer,
word_embeddings,
ner_clinical,
ner_clinical_converter,
chunk2doc,
bge_embeddings,
icd10cm_resolver
])
text = """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with acute hepatitis and obesity, presented with a one-week history of polyuria, polydipsia, and vomiting."""
data = spark.createDataFrame([[text]]).toDF("text")
result = pipeline.fit(data).transform(data)
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentence_detector = nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
word_embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("word_embeddings")
ner_clinical = medical.NerModel.pretrained("ner_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token", "word_embeddings"])\
.setOutputCol("ner")
ner_clinical_converter = medical.NerConverter()\
.setInputCols(["sentence", "token", "ner"])\
.setOutputCol("ner_chunk")\
.setWhiteList(["PROBLEM"])
chunk2doc = nlp.Chunk2Doc()\
.setInputCols("ner_chunk")\
.setOutputCol("ner_chunk_doc")
bge_embeddings = nlp.BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")\
.setInputCols(["ner_chunk_doc"])\
.setOutputCol("bge_embeddings")
icd10cm_resolver = medical.SentenceEntityResolverModel.pretrained("bgeresolve_icd10cm", "en", "clinical/models")\
.setInputCols(["bge_embeddings"])\
.setOutputCol("icd10cm_code")\
.setDistanceFunction("EUCLIDEAN")
pipeline = nlp.Pipeline().setStages([
document_assembler,
sentence_detector,
tokenizer,
word_embeddings,
ner_clinical,
ner_clinical_converter,
chunk2doc,
bge_embeddings,
icd10cm_resolver
])
text = """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with acute hepatitis and obesity, presented with a one-week history of polyuria, polydipsia, and vomiting."""
data = spark.createDataFrame([[text]]).toDF("text")
result = pipeline.fit(data).transform(data)
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
.setInputCols(Array("document"))
.setOutputCol("sentence")
val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("word_embeddings")
val nerClinical = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "word_embeddings"))
.setOutputCol("ner")
val nerClinicalConverter = new NerConverterInternal()
.setInputCols(Array("sentence", "token", "ner"))
.setOutputCol("ner_chunk")
.setWhiteList(Array("PROBLEM"))
val chunk2doc = new Chunk2Doc()
.setInputCols("ner_chunk")
.setOutputCol("ner_chunk_doc")
val bgeEmbeddings = BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")
.setInputCols(Array("ner_chunk_doc"))
.setOutputCol("bge_embeddings")
val icd10cmResolver = SentenceEntityResolverModel.pretrained("bgeresolve_icd10cm", "en", "clinical/models")
.setInputCols(Array("bge_embeddings"))
.setOutputCol("icd10cm_code")
.setDistanceFunction("EUCLIDEAN")
val pipeline = new Pipeline().setStages(Array(
documentAssembler,
sentenceDetector,
tokenizer,
wordEmbeddings,
nerClinical,
nerClinicalConverter,
chunk2doc,
bgeEmbeddings,
icd10cmResolver
))
val data = Seq("""A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with acute hepatitis and obesity, presented with a one-week history of polyuria, polydipsia, and vomiting.""").toDF("text")
val result = pipeline.fit(data).transform(data)
Results
|sent_id|ner_chunk |entity |icd10cm_code|resolutions |all_codes |all_resolutions |
|-------|--------------------------------------|-------|------------|-----------------------------|-----------------------------------------------|-----------------------------------------------|
|0 |gestational diabetes mellitus |PROBLEM|O24.4 |gestational diabetes mellitus|[O24.4, O24.41, O24.43, O24.42, O24.414, ...] |[gestational diabetes mellitus, gestational di...]|
|0 |subsequent type two diabetes mellitus |PROBLEM|E11 |type 2 diabetes mellitus |[E11, E11.69, E11.6, E11.64, E13, E11.65, ...] |[type 2 diabetes mellitus, type 2 diabetes mel...]|
|0 |T2DM |PROBLEM|E11 |type 2 diabetes mellitus |[E11, E11.65, E11.64, E11.9, E11.44, E11.5, ...]|[type 2 diabetes mellitus, type 2 diabetes mel...]|
|0 |HTG-induced pancreatitis |PROBLEM|K85.3 |drug induced acute pancreatitis|[K85.3, K86.0, K85.2, K85.31, K85.21, ...] |[drug induced acute pancreatitis, alcohol-indu...]|
|0 |acute hepatitis |PROBLEM|B15 |acute hepatitis a |[B15, B16, B17.1, B17, B17.2, B17.9, ...] |[acute hepatitis a, acute hepatitis b, acute h...]|
|0 |obesity |PROBLEM|E66 |overweight and obesity |[E66, E66.9, E66.0, E66.8, E66.3, O99.21, ...] |[overweight and obesity, obesity, unspecified,...]|
|0 |polyuria |PROBLEM|R35 |polyuria |[R35, R35.89, R35.8, R35.81, R80, R80.8, ...] |[polyuria, other polyuria, other polyuria, noc...]|
|0 |polydipsia |PROBLEM|R63.1 |polydipsia |[R63.1, O40, R35, R35.89, R35.8, R63.2, ...] |[polydipsia, polyhydramnios, polyuria, other p...]|
|0 |vomiting |PROBLEM|R11.1 |vomiting |[R11.1, R11, R11.12, R11.10, R11.11, R11.13, ...]|[vomiting, nausea and vomiting, projectile vom...]|
Model Information
| Model Name: | bgeresolve_icd10cm |
| Compatibility: | Healthcare NLP 6.2.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [bge_embeddings] |
| Output Labels: | [icd10cm_code] |
| Language: | en |
| Size: | 281.4 MB |
| Case sensitive: | false |