Description
RoBERTa-based token classification model for extracting symptom mentions from text using BIO labels (B-SYMPTOM, I-SYMPTOM, O). Intended for dense symptom recognition in clinical and biomedical narratives.
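In the BIO scheme, B-SYMPTOM marks the first token of a symptom mention, I-SYMPTOM marks its continuation tokens, and O marks everything else. As a rough, library-independent illustration (plain Python, not part of Spark NLP), contiguous B-/I- tags fold into chunks roughly like this; in the pipelines below, the NerConverter stage performs this merging on the model's token-level output:

# Toy sketch of BIO-to-chunk merging over parallel token/label lists.
# Illustrative only; NerConverter handles this step inside the pipeline.
def merge_bio(tokens, labels):
    chunks, current = [], []
    for tok, lab in zip(tokens, labels):
        if lab == "B-SYMPTOM":
            if current:
                chunks.append(" ".join(current))
            current = [tok]
        elif lab == "I-SYMPTOM" and current:
            current.append(tok)
        else:
            if current:
                chunks.append(" ".join(current))
                current = []
    if current:
        chunks.append(" ".join(current))
    return chunks

tokens = ["Patient", "reported", "night", "sweats", "and", "chest", "pain", "."]
labels = ["O", "O", "B-SYMPTOM", "I-SYMPTOM", "O", "B-SYMPTOM", "I-SYMPTOM", "O"]
print(merge_bio(tokens, labels))  # ['night sweats', 'chest pain']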
How to use
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
# Assemble raw text into document annotations
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split documents into tokens
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# RoBERTa token classifier that assigns a BIO symptom label to every token
tokenClassifier = RoBertaForTokenClassification \
    .pretrained("roberta_symptom_ner_onnx", "en", "clinical/models") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")

# Merge B-/I- tagged tokens into symptom chunks
converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    tokenClassifier,
    converter
])
data = spark.createDataFrame([
["Patient reported fever, chills, night sweats, fatigue, malaise, weight loss, headache, dizziness, blurred vision, chest pain, palpitations, shortness of breath, cough, wheezing, nausea, vomiting, abdominal pain, diarrhea, constipation, dysphagia, and loss of appetite."],
["During evaluation the patient complained of severe back pain, joint stiffness, muscle weakness, numbness, tingling, tremors, gait instability, confusion, memory loss, anxiety, depression, insomnia, irritability, mood swings, difficulty concentrating, and episodes of syncope."],
["Postoperatively the patient experienced incisional pain, swelling, redness, warmth, bruising, bleeding, wound drainage, fever spikes, chills, shortness of breath, rapid heartbeat, lightheadedness, nausea, vomiting, bloating, constipation, urinary retention, dysuria, and generalized weakness."]
]).toDF("text")
result = pipeline.fit(data).transform(data)
result.selectExpr("explode(ner_chunk) as chunk").selectExpr(
"chunk.result as text",
"chunk.metadata['entity'] as entity"
).show(truncate=False)
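For quick checks on a single string, the fitted pipeline can also be wrapped in a LightPipeline (a sketch; the exact output structure can vary slightly across Spark NLP versions):

from sparknlp.base import LightPipeline

# Wrap the fitted pipeline for fast in-memory annotation of a single string.
light = LightPipeline(pipeline.fit(data))
annotated = light.fullAnnotate("Patient reported fever, chills, and shortness of breath.")[0]
for chunk in annotated["ner_chunk"]:
    print(chunk.result, chunk.metadata["entity"])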
from johnsnowlabs import nlp, medical
documentAssembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = nlp.Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

tokenClassifier = medical.RoBertaForTokenClassification \
    .pretrained("roberta_symptom_ner_onnx", "en", "clinical/models") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")

converter = nlp.NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    tokenizer,
    tokenClassifier,
    converter
])
data = spark.createDataFrame([
["Patient reported fever, chills, night sweats, fatigue, malaise, weight loss, headache, dizziness, blurred vision, chest pain, palpitations, shortness of breath, cough, wheezing, nausea, vomiting, abdominal pain, diarrhea, constipation, dysphagia, and loss of appetite."],
["During evaluation the patient complained of severe back pain, joint stiffness, muscle weakness, numbness, tingling, tremors, gait instability, confusion, memory loss, anxiety, depression, insomnia, irritability, mood swings, difficulty concentrating, and episodes of syncope."],
["Postoperatively the patient experienced incisional pain, swelling, redness, warmth, bruising, bleeding, wound drainage, fever spikes, chills, shortness of breath, rapid heartbeat, lightheadedness, nausea, vomiting, bloating, constipation, urinary retention, dysuria, and generalized weakness."]
]).toDF("text")
result = pipeline.fit(data).transform(data)
result.selectExpr("explode(ner_chunk) as chunk").selectExpr(
"chunk.result as text",
"chunk.metadata['entity'] as entity"
).show(truncate=False)
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._
import org.apache.spark.sql.functions._
import org.apache.spark.ml.Pipeline
import spark.implicits._

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

val tokenClassifier = RoBertaForTokenClassification
  .pretrained("roberta_symptom_ner_onnx", "en", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("ner")

val converter = new NerConverter()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  tokenClassifier,
  converter
))
val data = Seq(
"Patient reported fever, chills, night sweats, fatigue, malaise, weight loss, headache, dizziness, blurred vision, chest pain, palpitations, shortness of breath, cough, wheezing, nausea, vomiting, abdominal pain, diarrhea, constipation, dysphagia, and loss of appetite.",
"During evaluation the patient complained of severe back pain, joint stiffness, muscle weakness, numbness, tingling, tremors, gait instability, confusion, memory loss, anxiety, depression, insomnia, irritability, mood swings, difficulty concentrating, and episodes of syncope.",
"Postoperatively the patient experienced incisional pain, swelling, redness, warmth, bruising, bleeding, wound drainage, fever spikes, chills, shortness of breath, rapid heartbeat, lightheadedness, nausea, vomiting, bloating, constipation, urinary retention, dysuria, and generalized weakness."
).toDF("text")
val result = pipeline.fit(data).transform(data)
result.selectExpr("explode(ner_chunk) as chunk")
.selectExpr("chunk.result as text", "chunk.metadata['entity'] as entity")
.show(false)
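To look at the raw per-token BIO labels rather than the merged chunks, the token and ner annotation arrays can be zipped together. A Python sketch against the result DataFrame from the examples above (the zip_with/named_struct expression is one way to pair them, not the only one):

# Pair each token with its predicted BIO label.
result.selectExpr(
    "explode(zip_with(token.result, ner.result, (t, l) -> named_struct('token', t, 'label', l))) AS tl"
).select("tl.token", "tl.label").show(50, truncate=False)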
Results
+---------+-------+
|text |entity |
+---------+-------+
|chills |SYMPTOM|
|night |SYMPTOM|
|sweats |SYMPTOM|
|fatigue |SYMPTOM|
|malaise |SYMPTOM|
|abdominal|SYMPTOM|
|back |SYMPTOM|
|joint |SYMPTOM|
|muscle |SYMPTOM|
|numbness |SYMPTOM|
|tingling |SYMPTOM|
|dysuria |SYMPTOM|
+---------+-------+
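The merged chunks can also be aggregated, for example to count how often each symptom surfaces across the corpus (a small PySpark sketch using the ner_chunk output defined above):

from pyspark.sql import functions as F

# Count distinct symptom mentions across all documents (case-folded).
result.select(F.explode("ner_chunk").alias("chunk")) \
    .select(F.lower(F.col("chunk.result")).alias("symptom")) \
    .groupBy("symptom").count() \
    .orderBy(F.desc("count")) \
    .show(truncate=False)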
Model Information
| Model Name: | roberta_symptom_ner_onnx |
| Compatibility: | Healthcare NLP 6.2.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [token, sentence] |
| Output Labels: | [ner] |
| Language: | en |
| Size: | 466.2 MB |
| Case sensitive: | true |
| Max sentence length: | 128 |
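Because the maximum sentence length is 128 tokens, long clinical notes are best split into sentences before classification. One way to wire this in (a sketch using Spark NLP's SentenceDetector; this sentence-level wiring is an assumption, not shown in the examples above):

from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, RoBertaForTokenClassification, NerConverter
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split long notes into sentences so each unit stays within the 128-token limit.
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

tokenClassifier = RoBertaForTokenClassification \
    .pretrained("roberta_symptom_ner_onnx", "en", "clinical/models") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("ner")

converter = NerConverter() \
    .setInputCols(["sentence", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, tokenClassifier, converter])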