Description
This model detects PHI (Protected Health Information) entities for deidentification purposes. It is a generic model that detects the following PHI entity types: DATE, NAME, LOCATION, ID, CONTACT, AGE, and PROFESSION.
Predicted Entities
DATE, NAME, LOCATION, ID, CONTACT, AGE, PROFESSION
How to use
# Reads the "text" column and writes the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence detector: "document" -> "sentence".
sentence_detector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained clinical word embeddings: "sentence" + "token" -> "embeddings".
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# The NER model documented on this page: writes its tags to "ner".
ner_model = (
    MedicalNerModel.pretrained("ner_deid_generic_nonMedical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converter stage: "sentence" + "token" + "ner" -> "ner_chunk".
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Stages are listed in the order their input columns become available.
stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_model,
    ner_converter,
]
pipeline = Pipeline().setStages(stages)

# Sample note; the literal starts with a newline, which counts towards character offsets.
text = """
Mr. James Wilson is a 65-year-old male who presented to the emergency department at Boston General Hospital on 10/25/2023.
He lives at 123 Oak Street, Springfield, IL 62704. He can be contacted at 555-0199.
His SSN is 999-00-1234. Dr. Gregory House is the attending physician.
"""

# Single-row DataFrame with one column named "text".
data = spark.createDataFrame([[text]]).toDF("text")

# Fit the pipeline on the sample frame, then annotate that same frame.
fitted_pipeline = pipeline.fit(data)
result = fitted_pipeline.transform(data)
# Reads the "text" column and writes the "document" column.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence detector: "document" -> "sentence".
sentence_detector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "token".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained clinical word embeddings: "sentence" + "token" -> "embeddings".
word_embeddings = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# The NER model documented on this page: writes its tags to "ner".
ner_model = (
    medical.MedicalNerModel.pretrained("ner_deid_generic_nonMedical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converter stage: "sentence" + "token" + "ner" -> "ner_chunk".
ner_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Stages are listed in the order their input columns become available.
stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    ner_model,
    ner_converter,
]
pipeline = nlp.Pipeline().setStages(stages)

# Sample note; the literal starts with a newline, which counts towards character offsets.
text = """
Mr. James Wilson is a 65-year-old male who presented to the emergency department at Boston General Hospital on 10/25/2023.
He lives at 123 Oak Street, Springfield, IL 62704. He can be contacted at 555-0199.
His SSN is 999-00-1234. Dr. Gregory House is the attending physician.
"""

# Single-row DataFrame with one column named "text".
data = spark.createDataFrame([[text]]).toDF("text")

# Fit the pipeline on the sample frame, then annotate that same frame.
fitted_pipeline = pipeline.fit(data)
result = fitted_pipeline.transform(data)
// Brings the Seq -> DataFrame conversion (`toDF`) into scope. spark-shell imports this
// automatically; compiled applications and most notebooks do not.
// Assumes `spark` is the active SparkSession held in a `val`.
import spark.implicits._

// Reads the "text" column and writes the "document" column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Pretrained sentence detector: "document" -> "sentence".
val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Tokenizer: "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Pretrained clinical word embeddings: "sentence" + "token" -> "embeddings".
val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("embeddings")

// The NER model documented on this page: writes its tags to "ner".
val nerModel = MedicalNerModel.pretrained("ner_deid_generic_nonMedical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "embeddings"))
  .setOutputCol("ner")

// Converter stage: "sentence" + "token" + "ner" -> "ner_chunk".
val nerConverter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner"))
  .setOutputCol("ner_chunk")

// Stages are listed in the order their input columns become available.
val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  sentenceDetector,
  tokenizer,
  wordEmbeddings,
  nerModel,
  nerConverter
))

// Single-row DataFrame with one column named "text". The literal starts with a newline,
// which counts towards character offsets.
val data = Seq("""
Mr. James Wilson is a 65-year-old male who presented to the emergency department at Boston General Hospital on 10/25/2023.
He lives at 123 Oak Street, Springfield, IL 62704. He can be contacted at 555-0199.
His SSN is 999-00-1234. Dr. Gregory House is the attending physician.
""").toDF("text")

// Fit the pipeline on the sample frame, then annotate that same frame.
val result = pipeline.fit(data).transform(data)
Results
|chunk |begin|end|ner_label|
|-----------------------|-----|---|---------|
|James Wilson |5 |16 |NAME |
|65-year-old |23 |33 |AGE |
|Boston General Hospital|85 |107|LOCATION |
|10/25/2023 |112 |121|DATE |
|123 Oak Street |137 |150|LOCATION |
|Springfield |153 |163|LOCATION |
|IL |166 |167|LOCATION |
|555-0199 |199 |206|CONTACT |
|999-00-1234 |221 |231|ID |
|Gregory House |238 |250|NAME |
Model Information
| Model Name: | ner_deid_generic_nonMedical |
| Compatibility: | Healthcare NLP 6.2.2+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [sentence, token, embeddings] |
| Output Labels: | [ner] |
| Language: | en |
| Size: | 14.7 MB |