Disease Named Entity Recognition Spanish (Base, ONNX)

Description

This is a RoBERTa-based NER model for Spanish medical text. It identifies disease mentions in clinical sentences and labels each token as B-DISEASE, I-DISEASE, or O. The model is intended for extracting disease names from notes, reports, and diagnoses, and is provided in ONNX format for efficient inference.

Copy S3 URI

How to use

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Pipeline stages; each one reads the column(s) written by the stages before it.
# Assumes an active Spark session is already bound to `spark`.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

token_splitter = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Pretrained Spanish disease model; writes its token-level predictions to `ner`.
disease_tagger = (
    RoBertaForTokenClassification
    .pretrained("roberta_disease_ner_es_onnx", "es", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("ner")
)

# Produces the `ner_chunk` column from the document, its tokens and their tags.
chunk_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

stages = [document_assembler, token_splitter, disease_tagger, chunk_converter]
pipeline = Pipeline(stages=stages)

# Single-row example DataFrame with one string column named `text`.
sample_rows = [["El paciente presenta diabetes tipo 2 e hipertensión arterial."]]
data = spark.createDataFrame(sample_rows).toDF("text")

fitted_pipeline = pipeline.fit(data)
result = fitted_pipeline.transform(data)

# One output row per extracted chunk: its text and its entity label.
chunks = result.selectExpr("explode(ner_chunk) as chunk")
chunks.selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity",
).show(truncate=False)
from johnsnowlabs import nlp, medical

# Same pipeline as above, built through the `johnsnowlabs` namespace (`nlp.*`).
# Assumes an active Spark session is already bound to `spark`.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

token_splitter = (
    nlp.Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Pretrained Spanish disease model; writes its token-level predictions to `ner`.
disease_tagger = (
    nlp.RoBertaForTokenClassification
    .pretrained("roberta_disease_ner_es_onnx", "es", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("ner")
)

# Produces the `ner_chunk` column from the document, its tokens and their tags.
chunk_converter = (
    nlp.NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = nlp.Pipeline(
    stages=[document_assembler, token_splitter, disease_tagger, chunk_converter]
)

# Single-row example DataFrame with one string column named `text`.
example_text = "El paciente presenta diabetes tipo 2 e hipertensión arterial."
data = spark.createDataFrame([[example_text]]).toDF("text")

fitted_pipeline = pipeline.fit(data)
result = fitted_pipeline.transform(data)

# One output row per extracted chunk: its text and its entity label.
exploded = result.selectExpr("explode(ner_chunk) as chunk")
exploded.selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity",
).show(truncate=False)
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
import com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForTokenClassification
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions._

// Stage 1: reads the raw "text" column and writes the "document" column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: reads "document" and writes "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Stage 3: pretrained Spanish disease model; writes its token-level predictions to "ner".
val tokenClassifier = RoBertaForTokenClassification
  .pretrained("roberta_disease_ner_es_onnx", "es", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("ner")

// Stage 4: produces the "ner_chunk" column from the document, its tokens and their tags.
val converter = new NerConverter()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  tokenClassifier,
  converter
))

// `toDF` on a local Seq comes from the SparkSession implicits; without this import
// the next statement does not compile. Assumes the active session is bound to
// `spark`, as it is in spark-shell.
import spark.implicits._

// Single-row example DataFrame with one string column named "text".
val data = Seq("El paciente presenta diabetes tipo 2 e hipertensión arterial.").toDF("text")
val result = pipeline.fit(data).transform(data)

// One output row per extracted chunk: its text and its entity label.
result.selectExpr("explode(ner_chunk) as chunk")
  .selectExpr("chunk.result as text", "chunk.metadata['entity'] as entity")
  .show(truncate = false)

Results


+------------+-------+
|text        |entity |
+------------+-------+
|diabetes    |DISEASE|
|hipertensión|DISEASE|
|arterial    |DISEASE|
+------------+-------+

Model Information

Model Name: roberta_disease_ner_es_onnx
Compatibility: Healthcare NLP 6.2.0+
License: Licensed
Edition: Official
Input Labels: [token, sentence]
Output Labels: [ner]
Language: es
Size: 469.7 MB
Case sensitive: true
Max sentence length: 128