Description
This model extracts ZIP code entities from clinical texts.
How to use
# Stage 1: read the "text" column and emit a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained sentence detector, "document" -> "sentence".
sentence_detector = (
    SentenceDetectorDLModel.pretrained(
        "sentence_detector_dl_healthcare", "en", "clinical/models"
    )
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenizer, "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained "zip_parser" contextual parser, writing to "chunk_zip".
zip_contextual_parser = (
    ContextualParserModel.pretrained("zip_parser", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("chunk_zip")
)

# Stage 5: chunk converter, "chunk_zip" -> "ner_chunk".
chunk_converter = (
    ChunkConverter()
    .setInputCols(["chunk_zip"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in the order they must run.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    zip_contextual_parser,
    chunk_converter,
]
parserPipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame holding an empty string in the "text" column.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = parserPipeline.fit(empty_df)

sample_text = """John Doe lives at 1234 Maple Street, Springfield, IL 62704. He works at 5678 Oak Avenue, Austin, TX 73301. His previous address was 4321 Pine Street, Los Angeles, CA 90001. His cousin Jane lives at 7890 Elm Street, Chicago, IL 60614."""

# Run the fitted pipeline over the sample text.
sample_df = spark.createDataFrame([[sample_text]]).toDF("text")
result = model.transform(sample_df)
# Stage 1: read the "text" column and emit a "document" annotation column.
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained sentence detector, "document" -> "sentence".
sentence_detector = (
    nlp.SentenceDetectorDLModel.pretrained(
        "sentence_detector_dl_healthcare", "en", "clinical/models"
    )
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenizer, "sentence" -> "token".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained "zip_parser" contextual parser, writing to "chunk_zip".
zip_contextual_parser = (
    medical.ContextualParserModel.pretrained("zip_parser", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("chunk_zip")
)

# Stage 5: chunk converter, "chunk_zip" -> "ner_chunk".
chunk_converter = (
    medical.ChunkConverter()
    .setInputCols(["chunk_zip"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in the order they must run.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    zip_contextual_parser,
    chunk_converter,
]
parserPipeline = nlp.Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame holding an empty string in the "text" column.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = parserPipeline.fit(empty_df)

sample_text = """John Doe lives at 1234 Maple Street, Springfield, IL 62704. He works at 5678 Oak Avenue, Austin, TX 73301. His previous address was 4321 Pine Street, Los Angeles, CA 90001. His cousin Jane lives at 7890 Elm Street, Chicago, IL 60614."""

# Run the fitted pipeline over the sample text.
sample_df = spark.createDataFrame([[sample_text]]).toDF("text")
result = model.transform(sample_df)
// Stage 1: read the "text" column and emit a "document" annotation column.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: pretrained sentence detector, "document" -> "sentence".
val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
  .setInputCols("document")
  .setOutputCol("sentence")

// Stage 3: tokenizer, "sentence" -> "token".
val tokenizer = new Tokenizer()
  .setInputCols("sentence")
  .setOutputCol("token")

// Stage 4: pretrained "zip_parser" contextual parser, writing to "chunk_zip".
// The val name must be a legal identifier and must match the entry in the
// stage array below.
val zip_contextual_parser = ContextualParserModel.pretrained("zip_parser", "en", "clinical/models")
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("chunk_zip")

// Stage 5: chunk converter, "chunk_zip" -> "ner_chunk".
val chunk_converter = new ChunkConverter()
  .setInputCols(Array("chunk_zip"))
  .setOutputCol("ner_chunk")

// Assemble the stages in the order they must run.
val parserPipeline = new Pipeline().setStages(Array(
  document_assembler,
  sentence_detector,
  tokenizer,
  zip_contextual_parser,
  chunk_converter
))

// `toDS` relies on `spark.implicits._` being in scope (spark-shell imports it automatically).
val data = Seq(Array("""John Doe lives at 1234 Maple Street, Springfield, IL 62704. He works at 5678 Oak Avenue, Austin, TX 73301. His previous address was 4321 Pine Street, Los Angeles, CA 90001. His cousin Jane lives at 7890 Elm Street, Chicago, IL 60614.""")).toDS.toDF("text")

// Fit and transform on the same single-row DataFrame.
val result = parserPipeline.fit(data).transform(data)
Results
| chunk | begin | end | label |
|--------:|--------:|------:|:--------|
| 62704 | 53 | 57 | ZIP |
| 73301 | 100 | 104 | ZIP |
| 90001 | 166 | 170 | ZIP |
| 60614 | 227 | 231 | ZIP |
Model Information
| Model Name: | zip_parser |
| Compatibility: | Healthcare NLP 6.2.2+ |
| License: | Licensed |
| Edition: | Official |
| Input Labels: | [document, token_doc] |
| Output Labels: | [entity_zip_code] |
| Language: | en |
| Size: | 4.6 KB |
| Case sensitive: | false |