Description
This pretrained model maps genes to their corresponding HPO codes. It also returns all possible HPO codes for each gene
in the all_k_resolutions field of the metadata.
How to use
# Stage 1: reads the "text" column and writes "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained sentence detector, "document" -> "sentence".
sentence_detector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenizer, "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained clinical embeddings over sentences and tokens.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained NER model producing the "ner" column.
clinical_ner = (
    MedicalNerModel.pretrained("ner_human_phenotype_gene_clinical_langtest", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6: converts NER tags into "ner_chunk"; the whitelist is set to GENE only.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["GENE"])
)

# Stage 7: the gene -> HPO code mapper, configured with the "hpo_code" relation.
mapperModel = (
    ChunkMapperModel.pretrained("gene_hpo_code_mapper", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("mappings")
    .setRels(["hpo_code"])
)

# Assemble the stages in the order they consume each other's output columns.
nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        mapperModel,
    ]
)

# The pipeline is fitted on a one-row DataFrame that holds an empty string.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = nlp_pipeline.fit(empty_df)

# Run the fitted pipeline on the sample sentence.
sample_df = spark.createDataFrame(
    [["""We will systematically examine seven genes (CHN1, MDH1, and SNAP25) that are altered in the three neurodegenerative diseases."""]]
).toDF("text")
result = model.transform(sample_df)
# Stage 1: reads the "text" column and writes "document".
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained sentence detector, "document" -> "sentence".
sentence_detector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenizer, "sentence" -> "token".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained clinical embeddings over sentences and tokens.
word_embeddings = (
    nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained NER model producing the "ner" column.
clinical_ner = (
    medical.NerModel.pretrained("ner_human_phenotype_gene_clinical_langtest", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6: converts NER tags into "ner_chunk"; the whitelist is set to GENE only.
ner_converter = (
    medical.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["GENE"])
)

# Stage 7: the gene -> HPO code mapper, configured with the "hpo_code" relation.
mapperModel = (
    medical.ChunkMapperModel.pretrained("gene_hpo_code_mapper", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("mappings")
    .setRels(["hpo_code"])
)

# Assemble the stages in the order they consume each other's output columns.
nlp_pipeline = nlp.Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        mapperModel,
    ]
)

# The pipeline is fitted on a one-row DataFrame that holds an empty string.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = nlp_pipeline.fit(empty_df)

# Run the fitted pipeline on the sample sentence.
sample_df = spark.createDataFrame(
    [["""We will systematically examine seven genes (CHN1, MDH1, and SNAP25) that are altered in the three neurodegenerative diseases."""]]
).toDF("text")
result = model.transform(sample_df)
// Stage 1: reads the "text" column and writes "document".
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: pretrained sentence detector, "document" -> "sentence".
val sentence_detector = SentenceDetectorDLModel
  .pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
  .setInputCols("document")
  .setOutputCol("sentence")

// Stage 3: tokenizer, "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols("sentence")
  .setOutputCol("token")

// Stage 4: pretrained clinical embeddings over sentences and tokens.
val word_embeddings = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("embeddings")

// Stage 5: pretrained NER model producing the "ner" column.
val clinical_ner = MedicalNerModel
  .pretrained("ner_human_phenotype_gene_clinical_langtest", "en", "clinical/models")
  .setInputCols(Array("sentence", "token", "embeddings"))
  .setOutputCol("ner")

// Stage 6: converts NER tags into "ner_chunk"; the whitelist is set to GENE only.
val ner_converter = new NerConverterInternal()
  .setInputCols(Array("sentence", "token", "ner"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("GENE"))

// Stage 7: the gene -> HPO code mapper, configured with the "hpo_code" relation.
val mapperModel = ChunkMapperModel
  .pretrained("gene_hpo_code_mapper", "en", "clinical/models")
  .setInputCols("ner_chunk")
  .setOutputCol("mappings")
  .setRels(Array("hpo_code"))

// Assemble the stages in the order they consume each other's output columns.
val nlp_pipeline = new Pipeline().setStages(
  Array(
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
    mapperModel
  )
)

// NOTE: calling toDF on a Seq needs Spark SQL implicits in scope
// (spark-shell provides them; elsewhere add `import spark.implicits._`).
val sample_text = """We will systematically examine seven genes (CHN1, MDH1, and SNAP25) that are altered in the three neurodegenerative diseases."""
val sample_texts = Seq(sample_text).toDF("text")

// Fit on the sample DataFrame, then transform that same DataFrame.
val fitted_pipeline = nlp_pipeline.fit(sample_texts)
val result = fitted_pipeline.transform(sample_texts)
Results
+------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| gene| hpo_code| all_k_resolutions|
+------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| CHN1|HP:0001177|HP:0001177:::HP:0001156:::HP:0025186:::HP:0001199:::HP:0009921:::HP:0001250:::HP:0001263:::HP:0007400:::HP:0000086:::HP:0001357:::HP:0000006:::HP:0003974:::HP:0000175:::HP:0003312:::HP:0009601:::HP...|
| MDH1|HP:0500149|HP:0500149:::HP:0001276:::HP:0001250:::HP:0001263:::HP:0100876:::HP:0002521:::HP:0001338:::HP:0000007:::HP:0008936:::HP:0012110:::HP:0200134:::HP:0007068:::HP:0000253:::HP:0000232:::HP:0001510:::HP...|
|SNAP25|HP:0002465|HP:0002465:::HP:0002421:::HP:0003701:::HP:0001270:::HP:0001288:::HP:0001283:::HP:0001284:::HP:0001250:::HP:0001252:::HP:0001251:::HP:0001249:::HP:0001265:::HP:0001260:::HP:0001263:::HP:0002515:::HP...|
+------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Model Information
Model Name: | gene_hpo_code_mapper |
Compatibility: | Healthcare NLP 6.0.4+ |
License: | Licensed |
Edition: | Official |
Input Labels: | [ner_chunk] |
Output Labels: | [mappings] |
Language: | en |
Size: | 733.1 KB |