Description
This model extracts specimen entities from clinical texts.
How to use
# Reads the raw "text" column and writes the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence detector: "document" -> "sentence".
sentence_detector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained specimen parser: consumes "sentence" and "token", writes "chunk_zip".
specimen_contextual_parser = (
    ContextualParserModel.pretrained("specimen_parser", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("chunk_zip")
)

# Converts the parser output in "chunk_zip" into the "ner_chunk" column.
chunk_converter = (
    ChunkConverter()
    .setInputCols(["chunk_zip"])
    .setOutputCol("ner_chunk")
)

# Stages run in the order listed; each one reads columns produced by an earlier stage.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    specimen_contextual_parser,
    chunk_converter,
]
parserPipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame whose "text" value is an empty string.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = parserPipeline.fit(empty_df)

sample_text = """Specimen ID: AB123-456789 was collected from the patient. The laboratory processed Specimen Number CD987654 yesterday. Use Specimen Code: XYZ12-3456 for tracking purposes. Sample was labeled as Specimen#EF34-789. Specimen No. GH56-123456 was sent to the pathology department."""

# Run the fitted pipeline over the sample text.
sample_df = spark.createDataFrame([[sample_text]]).toDF("text")
result = model.transform(sample_df)
# Same pipeline as above, written against the `nlp` / `medical` namespaces.

# Language and model location shared by both pretrained stages below.
MODEL_LANG, MODEL_REPO = "en", "clinical/models"

# "text" -> "document"
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence" (pretrained sentence detector)
sentence_detector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", MODEL_LANG, MODEL_REPO)
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# "sentence" + "token" -> "chunk_zip" (pretrained specimen parser)
specimen_contextual_parser = (
    medical.ContextualParserModel.pretrained("specimen_parser", MODEL_LANG, MODEL_REPO)
    .setInputCols(["sentence", "token"])
    .setOutputCol("chunk_zip")
)

# "chunk_zip" -> "ner_chunk"
chunk_converter = (
    medical.ChunkConverter()
    .setInputCols(["chunk_zip"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in dependency order.
stages_in_order = [
    document_assembler,
    sentence_detector,
    tokenizer,
    specimen_contextual_parser,
    chunk_converter,
]
parserPipeline = nlp.Pipeline(stages=stages_in_order)

# Fit on a single-row DataFrame containing an empty "text" string.
fit_df = spark.createDataFrame([[""]]).toDF("text")
model = parserPipeline.fit(fit_df)

sample_text = """Specimen ID: AB123-456789 was collected from the patient. The laboratory processed Specimen Number CD987654 yesterday. Use Specimen Code: XYZ12-3456 for tracking purposes. Sample was labeled as Specimen#EF34-789. Specimen No. GH56-123456 was sent to the pathology department."""

# Apply the fitted pipeline to the sample text.
input_df = spark.createDataFrame([[sample_text]]).toDF("text")
result = model.transform(input_df)
// Scala version of the specimen-extraction pipeline.
// NOTE: `.toDS` below requires the Spark session implicits
// (`import spark.implicits._`) to be in scope.

// Reads the raw "text" column and writes the "document" column.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Pretrained sentence detector: "document" -> "sentence".
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Tokenizer: "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("sentence"))
  .setOutputCol("token")

// Pretrained specimen parser: consumes "sentence" and "token", writes "chunk_zip".
val specimen_contextual_parser = ContextualParserModel.pretrained("specimen_parser","en","clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("chunk_zip")

// Converts the parser output in "chunk_zip" into the "ner_chunk" column.
val chunk_converter = new ChunkConverter()
  .setInputCols(Array("chunk_zip"))
  .setOutputCol("ner_chunk")

// Stages run in the order listed; each reads columns produced by an earlier stage.
val parserPipeline = new Pipeline().setStages(Array(
  document_assembler,
  sentence_detector,
  tokenizer,
  specimen_contextual_parser,
  chunk_converter
))

// Fit on a one-row DataFrame with a plain String "text" column, matching the
// Python examples (`Seq("")` rather than `Seq(Array(""))`, which would create
// an array<string> column instead).
val model = parserPipeline.fit(Seq("").toDS.toDF("text"))

val sample_text = """Specimen ID: AB123-456789 was collected from the patient. The laboratory processed Specimen Number CD987654 yesterday. Use Specimen Code: XYZ12-3456 for tracking purposes. Sample was labeled as Specimen#EF34-789. Specimen No. GH56-123456 was sent to the pathology department."""

// Run the fitted pipeline over the sample text (one row, String column "text").
val result = model.transform(Seq(sample_text).toDS.toDF("text"))
Results
| specimen_id | begin | end | label |
|:--------------|--------:|------:|:---------|
| AB123 | 13 | 18 | SPECIMEN |
| CD987654 | 99 | 107 | SPECIMEN |
| XYZ12-3456 | 138 | 148 | SPECIMEN |
| EF34-789 | 203 | 211 | SPECIMEN |
| GH56-123456 | 226 | 237 | SPECIMEN |
Model Information
| Model Name: | specimen_parser |
| Compatibility: | Healthcare NLP 6.2.2+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [document, token_doc] |
| Output Labels: | [entity_specimen] |
| Language: | en |
| Size: | 4.4 KB |
| Case sensitive: | false |