Description
This model is a text matcher designed to automatically extract mentions of phenotypic abnormalities associated with human diseases from clinical or biomedical text.
Predicted Entities
HPO
How to use
# Example pipeline: strip stop words from the note, rebuild the text, split it
# into sentences, re-tokenize, and run the pretrained "hpo_matcher" over it.

# Read the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# First tokenization pass; its output only feeds the stop-word cleaner.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Remove stop words, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Re-assemble the surviving tokens into a new document column.
# NOTE(review): the begin/end offsets reported downstream presumably refer to
# this re-assembled text rather than to the raw input -- confirm.
token_assembler = (
    TokenAssembler()
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("cleanTokens_newDoc")
)

# Pretrained sentence detector applied to the cleaned document.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["cleanTokens_newDoc"])
    .setOutputCol("sentence")
)

# Second tokenization pass, this time over the detected sentences.
tokenizer_2 = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("clean_tokens")
)

# Pretrained matcher. `pretrained` is called on the class itself (as for the
# sentence detector above) instead of on a throwaway instance.
entityExtractor = (
    TextMatcherInternalModel.pretrained("hpo_matcher", "en", "clinical/models")
    .setInputCols(["sentence", "clean_tokens"])
    .setOutputCol("hpo_term")
    .setCaseSensitive(False)      # match regardless of letter case
    .setMergeOverlapping(False)   # do not merge overlapping matches
)

# Stage order matters: each stage consumes columns produced by earlier ones.
matcher_pipeline = Pipeline().setStages([
    documentAssembler,
    tokenizer,
    stopwords_cleaner,
    token_assembler,
    sentenceDetector,
    tokenizer_2,
    entityExtractor,
])

# Sample clinical note used as the single input row.
text = ''' APNEA: Presumed apnea of prematurity since < 34 wks gestation at birth.
GENETICS: Holds thumbs in palms, findings on Hemolytic Uremic Syndrome, history of meconium plugs.
HYPERBILIRUBINEMIA: At risk for hyperbilirubinemia d/t prematurity.
1/25-1/30: Received Amp/Gent while undergoing sepsis evaluation. '''

data = spark.createDataFrame([[text]]).toDF("text")

# Despite its name, `matcher_model` holds the transformed DataFrame (the
# result of fit(...).transform(...)), not the fitted pipeline model.
matcher_model = matcher_pipeline.fit(data).transform(data)
# Same example pipeline written against the `nlp` / `medical` namespaces:
# clean stop words, rebuild the text, split into sentences, re-tokenize, and
# run the pretrained "hpo_matcher".

# Read the raw "text" column into a "document" annotation.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# First tokenization pass; its output only feeds the stop-word cleaner.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Remove stop words, ignoring case.
stopwords_cleaner = (
    nlp.StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Re-assemble the surviving tokens into a new document column.
# NOTE(review): the begin/end offsets reported downstream presumably refer to
# this re-assembled text rather than to the raw input -- confirm.
token_assembler = (
    nlp.TokenAssembler()
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("cleanTokens_newDoc")
)

# Pretrained sentence detector applied to the cleaned document.
sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["cleanTokens_newDoc"])
    .setOutputCol("sentence")
)

# Second tokenization pass, this time over the detected sentences.
tokenizer_2 = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("clean_tokens")
)

# Pretrained matcher. `pretrained` is called on the class itself (as for the
# sentence detector above) instead of on a throwaway instance.
entityExtractor = (
    medical.TextMatcherInternalModel.pretrained("hpo_matcher", "en", "clinical/models")
    .setInputCols(["sentence", "clean_tokens"])
    .setOutputCol("hpo_term")
    .setCaseSensitive(False)      # match regardless of letter case
    .setMergeOverlapping(False)   # do not merge overlapping matches
)

# Stage order matters: each stage consumes columns produced by earlier ones.
matcher_pipeline = nlp.Pipeline().setStages([
    documentAssembler,
    tokenizer,
    stopwords_cleaner,
    token_assembler,
    sentenceDetector,
    tokenizer_2,
    entityExtractor,
])

# Sample clinical note used as the single input row.
text = ''' APNEA: Presumed apnea of prematurity since < 34 wks gestation at birth.
GENETICS: Holds thumbs in palms, findings on Hemolytic Uremic Syndrome, history of meconium plugs.
HYPERBILIRUBINEMIA: At risk for hyperbilirubinemia d/t prematurity.
1/25-1/30: Received Amp/Gent while undergoing sepsis evaluation. '''

data = spark.createDataFrame([[text]]).toDF("text")

# Despite its name, `matcher_model` holds the transformed DataFrame (the
# result of fit(...).transform(...)), not the fitted pipeline model.
matcher_model = matcher_pipeline.fit(data).transform(data)
// Example pipeline: strip stop words from the note, rebuild the text, split it
// into sentences, re-tokenize, and run the pretrained "hpo_matcher" over it.

// Read the raw "text" column into a "document" annotation.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// First tokenization pass; its output only feeds the stop-word cleaner.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Remove stop words, ignoring case.
val stopwordsCleaner = new StopWordsCleaner()
  .setInputCols(Array("token"))
  .setOutputCol("cleanTokens")
  .setCaseSensitive(false)

// Re-assemble the surviving tokens into a new document column.
val tokenAssembler = new TokenAssembler()
  .setInputCols(Array("document", "cleanTokens"))
  .setOutputCol("cleanTokens_newDoc")

// Pretrained sentence detector applied to the cleaned document.
val sentenceDetector = SentenceDetectorDLModel
  .pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols(Array("cleanTokens_newDoc"))
  .setOutputCol("sentence")

// Second tokenization pass, this time over the detected sentences.
val tokenizer2 = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("clean_tokens")

// Pretrained matcher: case-insensitive, overlapping matches are not merged.
val entityExtractor = TextMatcherInternalModel
  .pretrained("hpo_matcher", "en", "clinical/models")
  .setInputCols(Array("sentence", "clean_tokens"))
  .setOutputCol("hpo_term")
  .setCaseSensitive(false)
  .setMergeOverlapping(false)

// Stage order matters: each stage consumes columns produced by earlier ones.
val pipeline = new Pipeline().setStages(
  Array(
    documentAssembler,
    tokenizer,
    stopwordsCleaner,
    tokenAssembler,
    sentenceDetector,
    tokenizer2,
    entityExtractor
  )
)

// Sample clinical note used as the single input row.
val sampleText = """APNEA: Presumed apnea of prematurity since < 34 wks gestation at birth.
GENETICS: Holds thumbs in palms, findings on Hemolytic Uremic Syndrome, history of meconium plugs.
HYPERBILIRUBINEMIA: At risk for hyperbilirubinemia d/t prematurity.
1/25-1/30: Received Amp/Gent while undergoing sepsis evaluation."""

val data = Seq(sampleText).toDF("text")

// Fit the pipeline on the sample row and apply it to the same data.
val matcher = pipeline.fit(data).transform(data)
Results
+-------------------------+-----+---+-----+
| chunk|begin|end|label|
+-------------------------+-----+---+-----+
| APNEA| 0| 4| HPO|
| apnea| 16| 20| HPO|
|Hemolytic Uremic Syndrome| 105|129| HPO|
| HYPERBILIRUBINEMIA| 156|173| HPO|
| hyperbilirubinemia| 181|198| HPO|
| sepsis| 257|262| HPO|
+-------------------------+-----+---+-----+
Model Information
| Model Name: | hpo_matcher |
| Compatibility: | Healthcare NLP 6.0.0+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [document, token] |
| Output Labels: | [hpo_term] |
| Language: | en |
| Size: | 2.1 MB |
| Case sensitive: | false |