Description
This model extracts substance usage information related to Social Determinants of Health from various kinds of biomedical documents.
Predicted Entities
Smoking
, Substance_Duration
, Substance_Use
, Substance_Quantity
, Substance_Frequency
, Alcohol
Live Demo Open in Colab Copy S3 URI
How to use
document_assembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")\
.setInputCols(["document"])\
.setOutputCol("sentence")
tokenizer = Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")
clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings")
ner_model = MedicalNerModel.pretrained("ner_sdoh_substance_usage_wip", "en", "clinical/models")\
.setInputCols(["sentence", "token","embeddings"])\
.setOutputCol("ner")
ner_converter = NerConverterInternal()\
.setInputCols(["sentence", "token", "ner"])\
.setOutputCol("ner_chunk")
pipeline = Pipeline(stages=[
document_assembler,
sentence_detector,
tokenizer,
clinical_embeddings,
ner_model,
ner_converter
])
sample_texts = ["He does drink occasional alcohol approximately 5 to 6 alcoholic drinks per month.",
"He continues to smoke one pack of cigarettes daily, as he has for the past 28 years."]
data = spark.createDataFrame(sample_texts, StringType()).toDF("text")
result = pipeline.fit(data).transform(data)
val document_assembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")
.setInputCols("document")
.setOutputCol("sentence")
val tokenizer = new Tokenizer()
.setInputCols("sentence")
.setOutputCol("token")
val clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")
val ner_model = MedicalNerModel.pretrained("ner_sdoh_substance_usage_wip", "en", "clinical/models")
.setInputCols(Array("sentence", "token","embeddings"))
.setOutputCol("ner")
val ner_converter = new NerConverterInternal()
.setInputCols(Array("sentence", "token", "ner"))
.setOutputCol("ner_chunk")
val pipeline = new Pipeline().setStages(Array(
document_assembler,
sentence_detector,
tokenizer,
clinical_embeddings,
ner_model,
ner_converter
))
val data = Seq("He does drink occasional alcohol approximately 5 to 6 alcoholic drinks per month.").toDS.toDF("text")
val result = pipeline.fit(data).transform(data)
Results
+----------------+-----+---+-------------------+
|chunk |begin|end|ner_label |
+----------------+-----+---+-------------------+
|drink |8 |12 |Alcohol |
|occasional |14 |23 |Substance_Frequency|
|alcohol |25 |31 |Alcohol |
|5 to 6 |47 |52 |Substance_Quantity |
|alcoholic drinks|54 |69 |Alcohol |
|per month |71 |79 |Substance_Frequency|
|smoke |16 |20 |Smoking |
|one pack |22 |29 |Substance_Quantity |
|cigarettes |34 |43 |Smoking |
|daily |45 |49 |Substance_Frequency|
|past 28 years |70 |82 |Substance_Duration |
+----------------+-----+---+-------------------+
Model Information
Model Name: | ner_sdoh_substance_usage_wip |
Compatibility: | Healthcare NLP 4.3.1+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [sentence, token, embeddings] |
Output Labels: | [ner] |
Language: | en |
Size: | 3.0 MB |
Benchmarking
label tp fp fn total precision recall f1
Substance_Frequency 52.0 2.0 12.0 64.0 0.962963 0.812500 0.881356
Smoking 77.0 4.0 2.0 79.0 0.950617 0.974684 0.962500
Alcohol 327.0 8.0 15.0 342.0 0.976119 0.956140 0.966027
Substance_Quantity 74.0 7.0 12.0 86.0 0.913580 0.860465 0.886228
Substance_Duration 27.0 7.0 14.0 41.0 0.794118 0.658537 0.720000
Substance_Use 204.0 8.0 6.0 210.0 0.962264 0.971429 0.966825