Description
Detect chemical compounds and genes in the medical text using the pretrained NER model. This model is trained with the BertForTokenClassification method from the transformers library and imported into Spark NLP.
Predicted Entities
B-CHEMICAL
, I-CHEMICAL
, B-GENE-Y
, I-GENE-Y
, B-GENE-N
, I-GENE-N
, O
, PAD
How to use
from sparknlp.base import DocumentAssembler
from sparknlp_jsl.annotator import MedicalBertForTokenClassifier
from sparknlp.annotator import Tokenizer, NerConverter
from pyspark.ml import Pipeline
document_assembler = (
DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
)
tokenizer = (
Tokenizer()
.setInputCols(["document"])
.setOutputCol("token")
)
token_classifier = (
MedicalBertForTokenClassifier.pretrained(
"bert_token_classifier_ner_chemprot_onnx",
"en",
"clinical/models"
)
.setInputCols(["token", "document"])
.setOutputCol("ner")
.setCaseSensitive(True)
)
ner_converter = (
NerConverterInternal()
.setInputCols(["document", "token", "ner"])
.setOutputCol("ner_chunk")
)
pipeline = Pipeline(stages=[
document_assembler,
tokenizer,
token_classifier,
ner_converter
])
test_sentence = "Keratinocyte growth factor and acidic fibroblast growth factor are mitogens for primary cultures of mammary epithelium."
data = spark.createDataFrame([[test_sentence]]).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
from johnsnowlabs import nlp, medical
document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
tokenizer = nlp.Tokenizer()\
.setInputCols(["document"])\
.setOutputCol("token")
token_classifier = medical.BertForTokenClassifier.pretrained(
"bert_token_classifier_ner_chemprot_onnx",
"en",
"clinical/models"
)\
.setInputCols(["token", "document"])\
.setOutputCol("ner")\
.setCaseSensitive(True)
ner_converter = medical.NerConverterInternal()\
.setInputCols(["document", "token", "ner"])\
.setOutputCol("ner_chunk")
pipeline = Pipeline(stages=[
document_assembler,
tokenizer,
token_classifier,
ner_converter
])
test_sentence = "Keratinocyte growth factor and acidic fibroblast growth factor are mitogens for primary cultures of mammary epithelium."
data = spark.createDataFrame([[test_sentence]]).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import com.johnsnowlabs.nlp.annotators.classifier.dl.MedicalBertForTokenClassifier
import org.apache.spark.ml.Pipeline
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val tokenizer = new Tokenizer()
.setInputCols("document")
.setOutputCol("token")
val tokenClassifier = MedicalBertForTokenClassifier
.pretrained("bert_token_classifier_ner_chemprot_onnx", "en", "clinical/models")
.setInputCols(Array("token", "document"))
.setOutputCol("ner")
.setCaseSensitive(true)
val nerConverter = new NerConverterInternal()
.setInputCols(Array("document", "token", "ner"))
.setOutputCol("ner_chunk")
val pipeline = new Pipeline()
.setStages(Array(
documentAssembler,
tokenizer,
tokenClassifier,
nerConverter
))
val testSentence = "Keratinocyte growth factor and acidic fibroblast growth factor are mitogens for primary cultures of mammary epithelium."
val data = Seq(testSentence).toDF("text")
val model = pipeline.fit(data)
val result = model.transform(data)
Results
+-------------------------------+------+
|text |entity|
+-------------------------------+------+
|Keratinocyte growth factor |GENE-Y|
|acidic fibroblast growth factor|GENE-Y|
+-------------------------------+------+
Model Information
Model Name: | bert_token_classifier_ner_chemprot_onnx |
Compatibility: | Healthcare NLP 6.1.1+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [document, token] |
Output Labels: | [ner] |
Language: | en |
Size: | 403.7 MB |
Case sensitive: | true |
Max sentence length: | 128 |