Disease Named Entity Recognition (Base, ONNX)

Description

RoBERTa-based token classification model for identifying disease mentions in text using the BIO tagging scheme (B-DISEASE, I-DISEASE, O). Designed for dense disease extraction in clinical and biomedical narratives.

Copy S3 URI

How to use

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: tokenize each document.
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Stage 3: pretrained RoBERTa token classifier that emits the BIO disease
# tags (B-DISEASE, I-DISEASE, O) into the "ner" column.
tokenClassifier = (
    RoBertaForTokenClassification
    .pretrained("roberta_disease_ner_onnx", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("ner")
)

# Stage 4: convert the token-level tags into "ner_chunk" annotations.
converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

stages = [documentAssembler, tokenizer, tokenClassifier, converter]
pipeline = Pipeline(stages=stages)

# Three sample clinical narratives, one per DataFrame row.
sample_texts = [
    "Patient presented with type 2 diabetes, hypertension, chronic kidney disease, congestive heart failure, chronic obstructive pulmonary disease, and atrial fibrillation, later complicated by diabetic retinopathy, peripheral neuropathy, gastroesophageal reflux disease, and osteoarthritis, requiring management of hyperlipidemia, hypothyroidism, and anemia.",
    "During hospitalization, the patient was diagnosed with pneumonia, sepsis, acute respiratory distress syndrome, acute kidney injury, liver cirrhosis, and deep vein thrombosis, with coexisting conditions of ulcerative colitis, Crohn's disease, systemic lupus erythematosus, and rheumatoid arthritis, necessitating interventions for heart failure exacerbation, arrhythmia, and electrolyte imbalance.",
    "In the oncology unit, the patient was treated for breast cancer, ovarian cancer, pancreatic cancer, and leukemia, with comorbidities including anemia, neutropenia, thrombocytopenia, febrile neutropenia, and cachexia, complicated by infections such as cytomegalovirus, herpes zoster, and candidiasis, alongside chronic conditions like osteoporosis, hypertension, diabetes mellitus, and chronic pain syndrome.",
]
data = spark.createDataFrame([[sentence] for sentence in sample_texts]).toDF("text")

result = pipeline.fit(data).transform(data)

# One output row per extracted chunk: its text and its entity label.
chunks = result.selectExpr("explode(ner_chunk) as chunk")
entities = chunks.selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity",
)
entities.show(truncate=False)
from johnsnowlabs import nlp, medical

# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: tokenize each document.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Stage 3: pretrained RoBERTa token classifier that emits the BIO disease
# tags (B-DISEASE, I-DISEASE, O) into the "ner" column.
tokenClassifier = (
    medical.RoBertaForTokenClassification
    .pretrained("roberta_disease_ner_onnx", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("ner")
)

# Stage 4: convert the token-level tags into "ner_chunk" annotations.
converter = (
    nlp.NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

stages = [documentAssembler, tokenizer, tokenClassifier, converter]
pipeline = nlp.Pipeline(stages=stages)

# Three sample clinical narratives, one per DataFrame row.
sample_texts = [
    "Patient presented with type 2 diabetes, hypertension, chronic kidney disease, congestive heart failure, chronic obstructive pulmonary disease, and atrial fibrillation, later complicated by diabetic retinopathy, peripheral neuropathy, gastroesophageal reflux disease, and osteoarthritis, requiring management of hyperlipidemia, hypothyroidism, and anemia.",
    "During hospitalization, the patient was diagnosed with pneumonia, sepsis, acute respiratory distress syndrome, acute kidney injury, liver cirrhosis, and deep vein thrombosis, with coexisting conditions of ulcerative colitis, Crohn's disease, systemic lupus erythematosus, and rheumatoid arthritis, necessitating interventions for heart failure exacerbation, arrhythmia, and electrolyte imbalance.",
    "In the oncology unit, the patient was treated for breast cancer, ovarian cancer, pancreatic cancer, and leukemia, with comorbidities including anemia, neutropenia, thrombocytopenia, febrile neutropenia, and cachexia, complicated by infections such as cytomegalovirus, herpes zoster, and candidiasis, alongside chronic conditions like osteoporosis, hypertension, diabetes mellitus, and chronic pain syndrome.",
]
data = spark.createDataFrame([[sentence] for sentence in sample_texts]).toDF("text")

result = pipeline.fit(data).transform(data)

# One output row per extracted chunk: its text and its entity label.
chunks = result.selectExpr("explode(ner_chunk) as chunk")
entities = chunks.selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity",
)
entities.show(truncate=False)

import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
import com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForTokenClassification
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions._

// Brings the Seq -> DataFrame conversion (`toDF`) into scope.
// Assumes an active SparkSession is bound to the stable identifier `spark`
// (as in spark-shell); adjust the name if your session is called otherwise.
import spark.implicits._

// Stage 1: turn the raw "text" column into "document" annotations.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: tokenize each document.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Stage 3: pretrained RoBERTa token classifier that emits the BIO disease
// tags (B-DISEASE, I-DISEASE, O) into the "ner" column.
val tokenClassifier = RoBertaForTokenClassification
  .pretrained("roberta_disease_ner_onnx", "en", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("ner")

// Stage 4: convert the token-level tags into "ner_chunk" annotations.
val converter = new NerConverter()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  tokenClassifier,
  converter
))

// Three sample clinical narratives, one per DataFrame row.
val data = Seq(
  "Patient presented with type 2 diabetes, hypertension, chronic kidney disease, congestive heart failure, chronic obstructive pulmonary disease, and atrial fibrillation, later complicated by diabetic retinopathy, peripheral neuropathy, gastroesophageal reflux disease, and osteoarthritis, requiring management of hyperlipidemia, hypothyroidism, and anemia.",
  "During hospitalization, the patient was diagnosed with pneumonia, sepsis, acute respiratory distress syndrome, acute kidney injury, liver cirrhosis, and deep vein thrombosis, with coexisting conditions of ulcerative colitis, Crohn's disease, systemic lupus erythematosus, and rheumatoid arthritis, necessitating interventions for heart failure exacerbation, arrhythmia, and electrolyte imbalance.",
  "In the oncology unit, the patient was treated for breast cancer, ovarian cancer, pancreatic cancer, and leukemia, with comorbidities including anemia, neutropenia, thrombocytopenia, febrile neutropenia, and cachexia, complicated by infections such as cytomegalovirus, herpes zoster, and candidiasis, alongside chronic conditions like osteoporosis, hypertension, diabetes mellitus, and chronic pain syndrome."
).toDF("text")

val result = pipeline.fit(data).transform(data)

// One output row per extracted chunk: its text and its entity label.
result.selectExpr("explode(ner_chunk) as chunk")
  .selectExpr("chunk.result as text", "chunk.metadata['entity'] as entity")
  .show(false)

Results


+--------------------+-------+
|text                |entity |
+--------------------+-------+
|hypertension        |DISEASE|
|atrial fibrillation |DISEASE|
|diabetic retinopathy|DISEASE|
|peripheral          |DISEASE|
|gastroesophageal    |DISEASE|
|osteoarthritis      |DISEASE|
|hyperlipidemia      |DISEASE|
|hypothyroidism      |DISEASE|
|anemia              |DISEASE|
|pneumonia           |DISEASE|
|sepsis              |DISEASE|
|cirrhosis           |DISEASE|
|thrombosis          |DISEASE|
|ulcerative          |DISEASE|
|colitis             |DISEASE|
+--------------------+-------+

Model Information

Model Name: roberta_disease_ner_onnx
Compatibility: Healthcare NLP 6.2.0+
License: Licensed
Edition: Official
Input Labels: [token, sentence]
Output Labels: [ner]
Language: en
Size: 466.2 MB
Case sensitive: true
Max sentence length: 128