



package annotators

  1. class Chunk2Token extends AnnotatorModel[Chunk2Token] with HasSimpleAnnotate[Chunk2Token]

    A feature transformer that converts the input array of strings (annotatorType CHUNK) into an array of chunk-based tokens (annotatorType TOKEN).

    When the input is empty, an empty array is returned.

    This annotator is especially convenient when using NGramGenerator annotations as inputs to WordEmbeddingsModels.


    Define a pipeline for generating n-grams

    val data = Seq(("A 63-year-old man presents to the hospital ...")).toDF("text")
    val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    val sentenceDetector = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
    val token = new Tokenizer().setInputCols("sentence").setOutputCol("token")
    val ngrammer = new NGramGenerator()

    Stage to convert n-gram CHUNKS to TOKEN type

    val chunk2Token = new Chunk2Token().setInputCols("ngrams").setOutputCol("ngram_tokens")
    val trainingPipeline = new Pipeline().setStages(Array(document, sentenceDetector, token, ngrammer, chunk2Token)).fit(data)
    val result = trainingPipeline.transform(data).cache()
    result.selectExpr("explode(ngram_tokens)").show(5, false)
      |col                                                             |
      |{token, 3, 15, A_63-year-old, {sentence -> 0, chunk -> 0}, []}  |
      |{token, 5, 19, 63-year-old_man, {sentence -> 0, chunk -> 1}, []}|
      |{token, 17, 28, man_presents, {sentence -> 0, chunk -> 2}, []}  |
      |{token, 21, 31, presents_to, {sentence -> 0, chunk -> 3}, []}   |
      |{token, 30, 35, to_the, {sentence -> 0, chunk -> 4}, []}        |
    See also


  2. class DocumentFiltererByClassifier extends AnnotatorModel[DocumentFiltererByClassifier] with HasSimpleAnnotate[DocumentFiltererByClassifier] with WhiteAndBlackListParams

    Filters documents by the result of classifier annotators.

    val text ="""British Department of Health confirms first two cases of in UK.
       |So my trip to visit my australian exchange student just got canceled because of Coronavirus.
       |I wish everyone to be safe at home and stop pandemic.""".stripMargin
    val documentAssembler = new DocumentAssembler()
    val sentenceDetector = new SentenceDetector()
    val tokenizer = new Tokenizer()
    val medicalBFSC = MedicalBertForSequenceClassification
       .pretrained("bert_sequence_classifier_covid_sentiment", "en", "clinical/models")
       .setInputCols("sentence", "token").setOutputCol("classifier")
    val documentFilterer = new DocumentFiltererByClassifier()
       .setInputCols("sentence", "classifier").setOutputCol("filteredDocuments")
    import spark.implicits._
    val textDF = Seq(text).toDF("text")
    val result = new Pipeline().setStages(Array(
    |col                                                                                             |
    |{document, 159, 211, I wish everyone to be safe at home and stop pandemic., {sentence -> 2}, []}|
  3. class DrugNormalizer extends AnnotatorModel[DrugNormalizer] with HasSimpleAnnotate[DrugNormalizer] with CheckLicense

    Annotator which normalizes raw text from clinical documents, e.g. drug names, dosages and units, as shown in the example below.

    See Spark NLP Workshop for more examples of usage.


    val data = Seq(
      ("Sodium Chloride/Potassium Chloride 13bag"),
      ("interferon alfa-2b 10 million unit ( 1 ml ) injec"),
      ("aspirin 10 meq/ 5 ml oral sol")
    val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    val drugNormalizer = new DrugNormalizer().setInputCols("document").setOutputCol("document_normalized")
    val trainingPipeline = new Pipeline().setStages(Array(document, drugNormalizer))
    val result =
    result.selectExpr("explode(document_normalized.result) as normalized_text").show(false)
    |normalized_text                                     |
    |Sodium Chloride / Potassium Chloride 13 bag         |
    |interferon alfa - 2b 10000000 unt ( 1 ml ) injection|
    |aspirin 2 meq/ml oral solution                      |
  4. class Flattener extends Transformer with ParamsAndFeaturesWritable

    Converts annotation results into exploded and flattened format.

     val dataSet = Seq("GENERAL: He is an elderly gentleman in no acute distress. He is sitting up in bed eating his breakfast." +
    " He is alert and oriented and answering questions appropriately.\nHEENT: Sclerae showed mild arcus senilis in the right." +
    " Left was clear. Pupils are equally round and reactive to light. Extraocular movements are intact. Oropharynx is clear." +
    "\nNECK: Supple. Trachea is midline. No jugular venous pressure distention is noted. No adenopathy in the cervical, " +
    "supraclavicular, or axillary areas.\nABDOMEN: Soft and not tender. There may be some fullness in the left upper quadrant, " +
    "although I do not appreciate a true spleen with inspiration.\nEXTREMITIES: There is some edema, but no cyanosis and " ).toDS.toDF("text")
    val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
    val sentenceDetector = new SentenceDetector().setInputCols(Array("document")).setOutputCol("sentence")
    val tokenizer = new Tokenizer().setInputCols(Array("sentence")).setOutputCol("token")
    val wordEmbeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models").setInputCols(Array("sentence", "token")).setOutputCol("embeddings")
    val clinicalNer = MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models").setInputCols(Array("sentence", "token", "embeddings")).setOutputCol("ner")
    val nerConverter = new NerConverter().setInputCols(Array("sentence", "token", "ner")).setOutputCol("ner_chunk")
    val clinicalAssertion = AssertionDLModel.pretrained("assertion_jsl_augmented", "en", "clinical/models").setInputCols(Array("sentence", "ner_chunk", "embeddings")).setOutputCol("assertion").setEntityAssertionCaseSensitive(false)
    val flattener = new Flattener()
     .setInputCols("sentence", "ner_chunk", "assertion")
     .setExplodeSelectedFields(Map("ner_chunk" -> Array("result","metadata.entity"),
    val pipeline = new Pipeline().setStages(
     val result =
        |ner_chunk_result                  |ner_chunk_metadata_entity|assertion_result|assertion_metadata_confidence|
        |distress                          |Symptom                  |Absent          |1.0                          |
        |arcus senilis                     |Disease_Syndrome_Disorder|Past            |1.0                          |
        |jugular venous pressure distention|Symptom                  |Absent          |1.0                          |
        |adenopathy                        |Symptom                  |Absent          |1.0                          |
        |tender                            |Symptom                  |Absent          |1.0                          |
        |fullness                          |Symptom                  |Possible        |0.9999                       |
        |edema                             |Symptom                  |Present         |1.0                          |
        |cyanosis                          |VS_Finding               |Absent          |1.0                          |
  5. class MultiChunk2Doc extends AnnotatorModel[MultiChunk2Doc] with HasSimpleAnnotate[MultiChunk2Doc] with WhiteAndBlackListParams with CheckLicense

    The MultiChunk2Doc annotator merges the given chunks to create a document.

    See also

    WhiteAndBlackListParams. Additionally, specified prefix and suffix texts can be placed before and after the merged chunks in the resulting document, and a separator can be placed between the chunks.


    val document_assembler = new DocumentAssembler()
    val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    val tokenizer = new Tokenizer()
    val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
     .setInputCols(Array("sentence", "token")).setOutputCol("embeddings")
    val ner = MedicalNerModel.pretrained("ner_clinical_large_langtest", "en", "clinical/models")
     .setInputCols("sentence", "token", "embeddings").setOutputCol("ner")
    val ner_converter = new NerConverterInternal()
     .setInputCols(Array("sentence", "token", "ner")).setOutputCol("ner_chunk")
    val multi_chunk2_doc = new MultiChunk2Doc()
    val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, ner, ner_converter, multi_chunk2_doc))
    import spark.implicits._
    val data = Seq(
    """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM),
    | one prior episode of HTG-induced pancreatitis three years prior to presentation, and associated with an acute hepatitis,
    | presented with a one-week history of polyuria, poor appetite, and vomiting. She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG.
    | She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa; significantly,
    | her abdominal examination was benign with no tenderness, guarding, or rigidity.""".stripMargin)
    val result =

    result.selectExpr("explode(new_doc) as result").show(false)
    |result                                                                                                    |
    |{document, 0, 48, <Physical examination><her abdominal examination>, {document -> 0, chunk_count -> 2}, []}|

Value Members

  1. object DocumentFiltererByClassifier extends DefaultParamsReadable[DocumentFiltererByClassifier] with Serializable

    This is the companion object of DocumentFiltererByClassifier.

  2. object DrugNormalizer extends DefaultParamsReadable[DrugNormalizer] with Serializable
  3. object Flattener extends ParamsAndFeaturesReadable[Flattener] with Serializable

    This is the companion object of Flattener.

  4. object MultiChunk2Doc extends DefaultParamsReadable[MultiChunk2Doc] with Serializable

    This is the companion object of MultiChunk2Doc.

