Description
This BERT-based NER model extracts key concepts from clinical trial texts, including trial groups, clinical endpoints, efficacy and safety measures, patient information, and statistical metrics.
Predicted Entities
B-Patient_Group, B-Confidence_Range, B-Trial_Group, B-Confidence_Interval, O, B-DATE, I-Confidence_level, I-P_Value, I-Patient_Group, I-Follow_Up, B-End_Point, B-Duration, B-Value, B-Follow_Up, B-Patient_Count, I-DATE, I-Value, I-ADE, I-Trial_Group, B-Confidence_level, B-Hazard_Ratio, I-Patient_Count, I-End_Point, B-P_Value, I-Confidence_Interval, I-Confidence_Range, B-ADE, I-Duration, PAD
How to use
# Example: run the pretrained clinical-trial NER model with the Spark NLP /
# Spark NLP for Healthcare Python APIs.
# NOTE: assumes an active Spark session is already bound to `spark`
# (it is not created in this snippet).
from sparknlp.base import DocumentAssembler
# NerConverterInternal is the converter actually used below, so it must be
# imported alongside the token classifier.
from sparknlp_jsl.annotator import MedicalBertForTokenClassifier, NerConverterInternal
from sparknlp.annotator import Tokenizer, NerConverter
from pyspark.ml import Pipeline

# Wrap the raw "text" column into a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Load the pretrained token classifier and write its per-token labels to "ner".
token_classifier = (
    MedicalBertForTokenClassifier.pretrained(
        "bert_token_classifier_drug_development_trials_onnx",
        "en",
        "clinical/models"
    )
    .setInputCols(["token", "document"])
    .setOutputCol("ner")
    .setCaseSensitive(True)
)

# Convert the "ner" token labels into chunk annotations in "ner_chunk".
ner_converter = (
    NerConverterInternal()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    token_classifier,
    ner_converter
])

test_sentence = (
    "In June 2003, the median overall survival with and without topotecan "
    "were 4.0 and 3.6 months, respectively. The best complete response (CR), "
    "partial response (PR), stable disease, and progressive disease were "
    "observed in 23, 63, 55, and 33 patients with topotecan, and 11, 61, 66, "
    "and 32 patients without topotecan."
)

# Single-row DataFrame whose only column is named "text".
data = spark.createDataFrame([[test_sentence]]).toDF("text")

model = pipeline.fit(data)
result = model.transform(data)
# Example: the same pipeline written against the `johnsnowlabs` package,
# which exposes the annotators through the `nlp` and `medical` namespaces.
# NOTE: assumes an active Spark session is already bound to `spark`
# (it is not created in this snippet).
from johnsnowlabs import nlp, medical

# Wrap the raw "text" column into a "document" annotation column.
document_assembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into tokens.
tokenizer = nlp.Tokenizer()\
    .setInputCols(["document"])\
    .setOutputCol("token")

# Load the pretrained token classifier and write its per-token labels to "ner".
token_classifier = medical.BertForTokenClassifier.pretrained(
    "bert_token_classifier_drug_development_trials_onnx",
    "en",
    "clinical/models"
)\
    .setInputCols(["token", "document"])\
    .setOutputCol("ner")\
    .setCaseSensitive(True)

# Convert the "ner" token labels into chunk annotations in "ner_chunk".
ner_converter = medical.NerConverterInternal()\
    .setInputCols(["document", "token", "ner"])\
    .setOutputCol("ner_chunk")

# Only `nlp` and `medical` are imported here, so Pipeline is reached through
# the `nlp` namespace rather than as a bare name.
pipeline = nlp.Pipeline(stages=[
    document_assembler,
    tokenizer,
    token_classifier,
    ner_converter
])

test_sentence = (
    "In June 2003, the median overall survival with and without topotecan "
    "were 4.0 and 3.6 months, respectively. The best complete response (CR), "
    "partial response (PR), stable disease, and progressive disease were "
    "observed in 23, 63, 55, and 33 patients with topotecan, and 11, 61, 66, "
    "and 32 patients without topotecan."
)

# Single-row DataFrame whose only column is named "text".
data = spark.createDataFrame([[test_sentence]]).toDF("text")

model = pipeline.fit(data)
result = model.transform(data)
// Example: run the pretrained clinical-trial NER model from Scala.
// NOTE: assumes an active SparkSession is already bound to the stable
// identifier `spark` (it is not created in this snippet).
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
// NerConverterInternal is the converter actually instantiated below.
import com.johnsnowlabs.nlp.annotators.ner.NerConverterInternal
import com.johnsnowlabs.nlp.annotators.classifier.dl.MedicalBertForTokenClassifier
import org.apache.spark.ml.Pipeline
// Brings the implicit conversions required by `Seq(...).toDF(...)` into scope.
import spark.implicits._

// Wrap the raw "text" column into a "document" annotation column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Split each document into tokens.
val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

// Load the pretrained token classifier and write its per-token labels to "ner".
val tokenClassifier = MedicalBertForTokenClassifier
  .pretrained("bert_token_classifier_drug_development_trials_onnx", "en", "clinical/models")
  .setInputCols(Array("token", "document"))
  .setOutputCol("ner")
  .setCaseSensitive(true)

// Convert the "ner" token labels into chunk annotations in "ner_chunk".
val nerConverter = new NerConverterInternal()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_chunk")

val pipeline = new Pipeline()
  .setStages(Array(
    documentAssembler,
    tokenizer,
    tokenClassifier,
    nerConverter
  ))

val testSentence =
  "In June 2003, the median overall survival with and without topotecan " +
  "were 4.0 and 3.6 months, respectively. The best complete response (CR), " +
  "partial response (PR), stable disease, and progressive disease were " +
  "observed in 23, 63, 55, and 33 patients with topotecan, and 11, 61, 66, " +
  "and 32 patients without topotecan."

// Single-row DataFrame whose only column is named "text".
val data = Seq(testSentence).toDF("text")

val model = pipeline.fit(data)
val result = model.transform(data)
Results
+-----------------------+-------------+
|text |entity |
+-----------------------+-------------+
|June 2003 |DATE |
|median |Duration |
|overall survival |End_Point |
|without topotecan |Trial_Group |
|4.0 |Value |
|3.6 months |Value |
|complete response (CR) |End_Point |
|partial response (PR) |End_Point |
|stable disease |End_Point |
|progressive disease |End_Point |
|23 |Patient_Count|
|63 |Patient_Count|
|55 |Patient_Count|
|33 patients |Patient_Count|
|topotecan |Trial_Group |
|11 |Patient_Count|
|61 |Patient_Count|
|66 |Patient_Count|
|32 patients |Patient_Count|
|without topotecan |Trial_Group |
+-----------------------+-------------+
Model Information
| Model Name: | bert_token_classifier_drug_development_trials_onnx |
| Compatibility: | Healthcare NLP 6.1.1+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [document, token] |
| Output Labels: | [ner] |
| Language: | en |
| Size: | 403.7 MB |
| Case sensitive: | true |
| Max sentence length: | 128 |