Sentence Entity Resolver for HGNC

Description

This model maps extracted gene names and their short-form abbreviations to HGNC codes using sbiobert_base_cased_mli Sentence Bert Embeddings. Also, it returns the locus groups and locus types of the genes as aux labels separated by under the metadata.

Predicted Entities

HGNC Codes, Locus Group, Locus Type

Open in Colab Copy S3 URI

How to use

document_assembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")


sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
  .setInputCols(["document"])\
  .setOutputCol("sentence")

tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")


word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence","token"])\
  .setOutputCol("embeddings")


clinical_ner = MedicalNerModel.pretrained("ner_human_phenotype_gene_clinical", "en", "clinical/models")\
  .setInputCols(["sentence","token","embeddings"])\
  .setOutputCol("ner")

ner_converter = NerConverterInternal()\
  .setInputCols(["sentence","token","ner"])\
  .setOutputCol("ner_chunk")\
  .setWhiteList(['GENE'])

chunk2doc = Chunk2Doc()\
  .setInputCols("ner_chunk")\
  .setOutputCol("ner_chunk_doc")

sbert_embedder = BertSentenceEmbeddings\
  .pretrained("sbiobert_base_cased_mli","en","clinical/models")\
  .setInputCols(["ner_chunk_doc"])\
  .setOutputCol("sbert_embeddings")

resolver = SentenceEntityResolverModel\
  .pretrained("sbiobertresolve_hgnc","en", "clinical/models") \
  .setInputCols(["ner_chunk", "sbert_embeddings"]) \
  .setOutputCol("resolution")\
  .setDistanceFunction("EUCLIDEAN")

nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, chunk2doc, sbert_embedder, resolver])


clinical_note = ["Recent studies have suggested a potential link between the double homeobox 4 like 20 (pseudogene), also known as DUX4L20, and FBXO48 and RNA guanine-7 methyltransferase "]


data= spark.createDataFrame([clinical_note]).toDF('text')
results = nlpPipeline.fit(data).transform(data)

val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")


val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")


val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence","token"))
  .setOutputCol("embeddings")


val clinical_ner = MedicalNerModel.pretrained("ner_human_phenotype_gene_clinical", "en", "clinical/models")
  .setInputCols(Array("sentence","token","embeddings"))
  .setOutputCol("ner")

val ner_converter = new NerConverterInternal()
  .setInputCols(Array("sentence","token","ner"))
  .setOutputCol("ner_chunk")
  .setWhiteList(Array("GENE"))

val chunk2doc = new Chunk2Doc()
  .setInputCols("ner_chunk")
  .setOutputCol("ner_chunk_doc")

val sbert_embedder = BertSentenceEmbeddings
  .pretrained("sbiobert_base_cased_mli","en","clinical/models")
  .setInputCols(Array("ner_chunk_doc"))
  .setOutputCol("sbert_embeddings")

val resolver = SentenceEntityResolverModel
  .pretrained("sbiobertresolve_hgnc","en", "clinical/models") 
  .setInputCols(Array("ner_chunk", "sbert_embeddings")) 
  .setOutputCol("resolution")
  .setDistanceFunction("EUCLIDEAN")

val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter, chunk2doc, sbert_embedder, resolver))

val data = Seq("A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus, associated with an acute hepatitis and obesity with a body mass index (BMI) of 33.5 kg/m2").toDS.toDF("text")

val result = pipeline.fit(data).transform(data)

Results

|   sent_id | ner_chunk   | entity   | HGNC Code    | all_codes                                                               | resolutions                                                                                                                    | AUX                                                                                                                             |
|----------:|:------------|:---------|:-------------|:------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------|
|         0 | DUX4L20     | GENE     | HGNC:50801   | ['HGNC:50801', 'HGNC:31982', 'HGNC:42423', 'HGNC:39776', 'HGNC:42023'...| ['DUX4L20 [double homeobox 4 like 20 (pseudogene)]', 'ANKRD20A4P [ankyrin repeat domain 20 family member A4, pseudogene]', ...| [pseudogene :: pseudogene, pseudogene :: pseudogene, non-coding RNA :: RNA, long non-coding, pseudogene :: pseudogene...|
|         0 | FBXO48      | GENE     | HGNC:33857   | ['HGNC:33857', 'HGNC:4930', 'HGNC:16653', 'HGNC:13114', 'HGNC:23535' ...| ['FBXO48 [F-box protein 48]', 'ZBTB48 [zinc finger and BTB domain containing 48]', 'MRPL48 [mitochondrial ribosomal protein' ...| [protein-coding gene :: gene with protein product, protein-coding gene :: gene with protein product, protein-coding gene...|

Model Information

Model Name: sbiobertresolve_hgnc
Compatibility: Healthcare NLP 4.3.2+
License: Licensed
Edition: Official
Input Labels: [sentence_embeddings]
Output Labels: [hgnc_code]
Language: en
Size: 251.9 MB
Case sensitive: false

References

https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/