Description
This is a BERT-based NER model for Italian medical text. It identifies disease mentions in clinical sentences and labels them using B-DISEASE, I-DISEASE, or O. The model is intended for extracting disease names from notes, reports, and diagnoses, and is provided in ONNX format for efficient inference.
How to use
from sparknlp.base import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
# Stage 1: turn the raw "text" column into "document" annotations.
doc_stage = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
token_stage = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

# Stage 3: load the pretrained Italian disease token classifier; it writes
# its per-token tags to the "ner" column.
ner_stage = (
    MedicalBertForTokenClassifier
    .pretrained("bert_token_classifier_disease_ner_it_onnx", "it", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("ner")
)

# Stage 4: build the "ner_chunk" column from the document, token and ner columns.
chunk_stage = (
    NerConverterInternal()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = Pipeline(stages=[doc_stage, token_stage, ner_stage, chunk_stage])

# Single-sentence sample input; `spark` is expected to be an active session.
sample_rows = [["Il paziente è stato diagnosticato con diabete e ipertensione."]]
data = spark.createDataFrame(sample_rows).toDF("text")

result = pipeline.fit(data).transform(data)

# One row per extracted chunk, showing its text and entity label.
chunks = result.selectExpr("explode(ner_chunk) as chunk")
chunk_table = chunks.selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity"
)
chunk_table.show(truncate=False)
from johnsnowlabs import nlp, medical
# Stage 1: turn the raw "text" column into "document" annotations.
documentAssembler = nlp.DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: split each document into tokens.
tokenizer = nlp.Tokenizer() \
    .setInputCols("document") \
    .setOutputCol("token")

# Stage 3: load the pretrained Italian disease token classifier; it writes
# its per-token tags to the "ner" column.
tokenClassifier = medical.MedicalBertForTokenClassifier \
    .pretrained("bert_token_classifier_disease_ner_it_onnx", "it", "clinical/models") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("ner")

# Stage 4: build the "ner_chunk" column from the document, token and ner columns.
# NerConverterInternal is a licensed annotator, so it is accessed through the
# `medical` namespace (the `nlp` namespace only carries the open-source NerConverter).
converter = medical.NerConverterInternal() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    tokenizer,
    tokenClassifier,
    converter
])

# Single-sentence sample input; `spark` is expected to be an active session.
data = spark.createDataFrame([["Il paziente è stato diagnosticato con diabete e ipertensione."]]).toDF("text")

result = pipeline.fit(data).transform(data)

# One row per extracted chunk, showing its text and entity label.
result.selectExpr("explode(ner_chunk) as chunk").selectExpr(
    "chunk.result as text",
    "chunk.metadata['entity'] as entity"
).show(truncate=False)
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
import com.johnsnowlabs.nlp.annotators.classification.MedicalBertForTokenClassifier
import com.johnsnowlabs.nlp.annotators.ner.NerConverterInternal
import org.apache.spark.ml.Pipeline
// Stage 1: turn the raw "text" column into "document" annotations.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: split each document into tokens.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Stage 3: load the pretrained Italian disease token classifier; it writes
// its per-token tags to the "ner" column.
val tokenClassifier = MedicalBertForTokenClassifier
  .pretrained("bert_token_classifier_disease_ner_it_onnx", "it", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("ner")

// Stage 4: build the "ner_chunk" column from the document, token and ner columns.
// Uses NerConverterInternal so this example matches the Python pipelines.
val converter = new NerConverterInternal()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  tokenClassifier,
  converter
))

// `toDF` on a local Seq needs the session implicits; `spark` is expected to be
// the active SparkSession (as in spark-shell or a notebook).
import spark.implicits._

val data = Seq("Il paziente è stato diagnosticato con diabete e ipertensione.").toDF("text")

val result = pipeline.fit(data).transform(data)

// One row per extracted chunk, showing its text and entity label.
result.selectExpr("explode(ner_chunk) as chunk")
  .selectExpr("chunk.result as text", "chunk.metadata['entity'] as entity")
  .show(false)
Results
+------------+-------+
|text |entity |
+------------+-------+
|diabete |DISEASE|
|ipertensione|DISEASE|
+------------+-------+
Model Information
| Model Name: | bert_token_classifier_disease_ner_it_onnx |
| Compatibility: | Healthcare NLP 6.2.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [token, sentence] |
| Output Labels: | [ner] |
| Language: | it |
| Size: | 441.5 MB |
| Case sensitive: | true |
| Max sentence length: | 128 |