Description
RoBERTa-based token classification model for extracting clinical procedure mentions from text using BIO labels (B-PROCEDURE, I-PROCEDURE, O). Intended for dense procedure recognition in clinical and biomedical narratives.
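For illustration, this is how the BIO scheme marks a multi-token procedure mention (a hand-written example, not model output); the NerConverter stage in the pipelines below merges contiguous B-/I- tags into entity chunks:

tokens = ["underwent", "coronary", "angiography", "and", "bronchoscopy", "."]
labels = ["O", "B-PROCEDURE", "I-PROCEDURE", "O", "B-PROCEDURE", "O"]
# After chunk conversion: "coronary angiography" -> PROCEDURE, "bronchoscopy" -> PROCEDURE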
How to use
Python (Spark NLP):

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols("document") \
    .setOutputCol("token")

# Pretrained procedure NER model from the licensed clinical/models repository
tokenClassifier = RoBertaForTokenClassification \
    .pretrained("roberta_procedure_ner_onnx", "en", "clinical/models") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")

# Merge contiguous B-/I- tags into entity chunks
converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    tokenClassifier,
    converter
])

data = spark.createDataFrame([
    ["Patient underwent blood draw, venipuncture, intravenous catheter insertion, chest x ray, computed tomography scan, magnetic resonance imaging, electrocardiogram, echocardiogram, stress test, cardiac catheterization, coronary angiography, and pulmonary function testing during initial evaluation."],
    ["During hospitalization the care team performed endotracheal intubation, mechanical ventilation, central line placement, arterial line placement, bronchoscopy, lumbar puncture, thoracentesis, paracentesis, hemodialysis, continuous renal replacement therapy, and bedside ultrasound examinations."],
    ["In the surgical setting the patient received laparoscopic appendectomy, exploratory laparotomy, bowel resection, colostomy creation, wound debridement, skin grafting, blood transfusion, postoperative drain placement, staple removal, physical therapy evaluation, occupational therapy assessment, and rehabilitation training."]
]).toDF("text")

result = pipeline.fit(data).transform(data)

result.selectExpr("explode(ner_chunk) as chunk").selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity"
).show(truncate=False)
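For quick ad-hoc inference on single strings, the fitted pipeline can be wrapped in a LightPipeline; a minimal sketch, assuming the pipeline and data defined above (the example sentence and printed output are illustrative only):

from sparknlp.base import LightPipeline

# Fit on the DataFrame above to obtain a PipelineModel, then wrap it for in-memory inference
model = pipeline.fit(data)
light = LightPipeline(model)

# annotate() returns plain strings per output column
annotations = light.annotate("The patient underwent coronary angiography followed by a lumbar puncture.")
print(annotations["ner_chunk"])  # illustrative output: ['coronary angiography', 'lumbar puncture']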
Python (johnsnowlabs):

from johnsnowlabs import nlp, medical

documentAssembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols("document") \
    .setOutputCol("token")

# Pretrained procedure NER model from the licensed clinical/models repository
tokenClassifier = medical.RoBertaForTokenClassification \
    .pretrained("roberta_procedure_ner_onnx", "en", "clinical/models") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")

# Merge contiguous B-/I- tags into entity chunks
converter = nlp.NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    tokenizer,
    tokenClassifier,
    converter
])

data = spark.createDataFrame([
    ["Patient underwent blood draw, venipuncture, intravenous catheter insertion, chest x ray, computed tomography scan, magnetic resonance imaging, electrocardiogram, echocardiogram, stress test, cardiac catheterization, coronary angiography, and pulmonary function testing during initial evaluation."],
    ["During hospitalization the care team performed endotracheal intubation, mechanical ventilation, central line placement, arterial line placement, bronchoscopy, lumbar puncture, thoracentesis, paracentesis, hemodialysis, continuous renal replacement therapy, and bedside ultrasound examinations."],
    ["In the surgical setting the patient received laparoscopic appendectomy, exploratory laparotomy, bowel resection, colostomy creation, wound debridement, skin grafting, blood transfusion, postoperative drain placement, staple removal, physical therapy evaluation, occupational therapy assessment, and rehabilitation training."]
]).toDF("text")

result = pipeline.fit(data).transform(data)

result.selectExpr("explode(ner_chunk) as chunk").selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity"
).show(truncate=False)
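Besides the merged chunks, the ner column carries the raw token-level BIO predictions; a minimal sketch for inspecting them, assuming the result DataFrame from either Python example above:

# One row per document: the token texts and their predicted BIO tags, aligned by position
result.selectExpr(
    "token.result as tokens",
    "ner.result as bio_labels"
).show(truncate=False)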
Scala:

import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators._
import org.apache.spark.sql.functions._
import org.apache.spark.ml.Pipeline
import spark.implicits._  // for .toDF on Seq

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Pretrained procedure NER model from the licensed clinical/models repository
val tokenClassifier = RoBertaForTokenClassification
  .pretrained("roberta_procedure_ner_onnx", "en", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("ner")

// Merge contiguous B-/I- tags into entity chunks
val converter = new NerConverter()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  tokenClassifier,
  converter
))

val data = Seq(
  "Patient underwent blood draw, venipuncture, intravenous catheter insertion, chest x ray, computed tomography scan, magnetic resonance imaging, electrocardiogram, echocardiogram, stress test, cardiac catheterization, coronary angiography, and pulmonary function testing during initial evaluation.",
  "During hospitalization the care team performed endotracheal intubation, mechanical ventilation, central line placement, arterial line placement, bronchoscopy, lumbar puncture, thoracentesis, paracentesis, hemodialysis, continuous renal replacement therapy, and bedside ultrasound examinations.",
  "In the surgical setting the patient received laparoscopic appendectomy, exploratory laparotomy, bowel resection, colostomy creation, wound debridement, skin grafting, blood transfusion, postoperative drain placement, staple removal, physical therapy evaluation, occupational therapy assessment, and rehabilitation training."
).toDF("text")

val result = pipeline.fit(data).transform(data)

result.selectExpr("explode(ner_chunk) as chunk")
  .selectExpr("chunk.result as text", "chunk.metadata['entity'] as entity")
  .show(false)
Results
+---------------+---------+
|text |entity |
+---------------+---------+
|cardiac |PROCEDURE|
|catheterization|PROCEDURE|
|coronary |PROCEDURE|
|angiography |PROCEDURE|
|bronchoscopy |PROCEDURE|
|thoracentesis |PROCEDURE|
|paracentesis |PROCEDURE|
|hemodialysis |PROCEDURE|
|continuous |PROCEDURE|
|laparoscopic |PROCEDURE|
|appendectomy |PROCEDURE|
|exploratory |PROCEDURE|
|laparotomy |PROCEDURE|
|bowel |PROCEDURE|
|colostomy |PROCEDURE|
|wound |PROCEDURE|
+---------------+---------+
Model Information
| Parameter | Value |
|---|---|
| Model Name | roberta_procedure_ner_onnx |
| Compatibility | Healthcare NLP 6.2.0+ |
| License | Licensed |
| Edition | Official |
| Input Labels | [token, sentence] |
| Output Labels | [ner] |
| Language | en |
| Size | 466.2 MB |
| Case sensitive | true |
| Max sentence length | 128 |
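Because the maximum sentence length is 128 tokens, long clinical notes are best split into sentences before classification. A minimal sketch of that variant, assuming the same Spark NLP imports and documentAssembler as the Python example above (only the changed stages are shown):

from sparknlp.annotator import SentenceDetector

# Split each document into sentences so every unit stays within the 128-token limit
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

tokenClassifier = RoBertaForTokenClassification \
    .pretrained("roberta_procedure_ner_onnx", "en", "clinical/models") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("ner")

converter = NerConverter() \
    .setInputCols(["sentence", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, tokenClassifier, converter])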