Training Datasets
These are classes for loading common datasets used to train annotators for tasks such as relation extraction, assertion status detection, NER, and more.
Annotation Tool JSON Reader
All the annotations from Annotation Lab can be exported in a standard JSON format, as shown below. The JSON holds multiple types of annotations, such as NER, Assertion, and Relations. The utility class AnnotationToolJsonReader processes this exported JSON and generates training datasets for NER and Assertion models. The AnnotationToolJsonReader Colab Notebook provides the code and details of this process in section 2, and also explains how to connect to your Annotation Lab instance via API to upload tasks and pre-annotations and to export entire projects. Users can distinguish between different label types by using the constructor parameters described below.
Input File Format:
[
{
"completions": [
{
"created_ago": "2020-05-18T20:48:18.117Z",
"created_username": "admin",
"id": 3001,
"lead_time": 19.255,
"result": [
{
"from_name": "ner",
"id": "o752YyB2g9",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 12,
"labels": [
"AsPresent"
],
"start": 3,
"text": "have faith"
}
},
{
"from_name": "ner",
"id": "wf2U3o7I6T",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 24,
"labels": [
"AsPresent"
],
"start": 16,
"text": " to trust"
}
},
{
"from_name": "ner",
"id": "Q3BkU5eZNx",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 40,
"labels": [
"AsPresent"
],
"start": 35,
"text": "to the"
}
}
]
}
],
"created_at": "2020-05-18 20:47:53",
"created_by": "andres.fernandez",
"data": {
"text": "To have faith is to trust yourself to the water"
},
"id": 3
},
{
"completions": [
{
"created_ago": "2020-05-17T17:52:41.563Z",
"created_username": "andres.fernandez",
"id": 1,
"lead_time": 31.449,
"result": [
{
"from_name": "ner",
"id": "IQjoZJNKEv",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 12,
"labels": [
"Disease"
],
"start": 3,
"text": "have faith"
}
},
{
"from_name": "ner",
"id": "tHsbn4oYy5",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 46,
"labels": [
"Treatment"
],
"start": 42,
"text": "water"
}
},
{
"from_name": "ner",
"id": "IJHkc9bxJ-",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 12,
"labels": [
"AsPresent"
],
"start": 0,
"text": "To have faith"
}
}
]
}
],
"created_at": "2020-05-17 17:52:02",
"created_by": "andres.fernandez",
"data": {
"text": "To have faith is to trust yourself to the water"
},
"id": 0
},
{
"completions": [
{
"created_ago": "2020-05-17T17:57:19.402Z",
"created_username": "andres.fernandez",
"id": 1001,
"lead_time": 15.454,
"result": [
{
"from_name": "ner",
"id": "j_lT0zwtrJ",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 46,
"labels": [
"Disease"
],
"start": 20,
"text": "trust yourself to the water"
}
},
{
"from_name": "ner",
"id": "e1FuGWu7EQ",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 33,
"labels": [
"AsPresent"
],
"start": 19,
"text": " trust yourself"
}
},
{
"from_name": "ner",
"id": "q0MCSM9SXz",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 12,
"labels": [
"Treatment"
],
"start": 0,
"text": "To have faith"
}
},
{
"from_name": "ner",
"id": "9R7dvPphPX",
"source": "$text",
"to_name": "text",
"type": "labels",
"value": {
"end": 12,
"labels": [
"AsPresent"
],
"start": 0,
"text": "To have faith"
}
}
]
}
],
"created_at": "2020-05-17 17:52:54",
"created_by": "andres.fernandez",
"data": {
"text": "To have faith is to trust yourself to the water"
},
"id": 1,
"predictions": []
}
]
Constructor Parameters:
- assertion_labels: The assertion labels used to generate the assertion training dataset.
- excluded_labels: The labels excluded from the training dataset creation.
- split_chars: The split characters used by the default tokenizer.
- context_chars: The context characters used by the default tokenizer.
- SDDLPath: The path to a SentenceDetectorDL model used for sentence splitting (may be left empty, as in the example below).
Parameters for readDataset:
- spark: Initiated Spark Session with Spark NLP
- path: Path to the resource
Refer to the documentation for more details on the API:
Python API: AnnotationToolJsonReader | Scala API: AnnotationToolJsonReader |
Show Example
from sparknlp_jsl.training import AnnotationToolJsonReader
assertion_labels = ["AsPresent","Absent"]
excluded_labels = ["Treatment"]
split_chars = [" ", "\\-"]
context_chars = [".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'","+","%","'"]
SDDLPath = ""
rdr = AnnotationToolJsonReader(assertion_labels = assertion_labels, excluded_labels = excluded_labels, split_chars = split_chars, context_chars = context_chars,SDDLPath=SDDLPath)
path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
df = rdr.readDataset(spark, json_path)
assertion_df = rdr.generateAssertionTrainSet(df)
assertion_df.show()
+--------------------+--------------+---------+-----+---+
| text| target| label|start|end|
+--------------------+--------------+---------+-----+---+
|To have faith is ...| To have faith|AsPresent| 0| 2|
|To have faith is ...| have faith|AsPresent| 1| 2|
|To have faith is ...| to trust|AsPresent| 4| 5|
|To have faith is ...| to the|AsPresent| 7| 8|
|To have faith is ...| yourself|AsPresent| 6| 6|
|To have faith is ...| To have faith|AsPresent| 0| 2|
|To have faith is ...|trust yourself|AsPresent| 5| 6|
+--------------------+--------------+---------+-----+---+
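The generated assertion training set can be persisted and then fed to the AssertionDLApproach described below; a minimal sketch using the standard Spark CSV writer (the output path is an assumption):
# Save the columns text, target, label, start, end for later assertion training
assertion_df.write.mode("overwrite").option("header", "true").csv("assertion_train_set")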
import com.johnsnowlabs.nlp.training.AnnotationToolJsonReader
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import scala.collection.JavaConverters._
val filename = "src/test/resources/json_import.json"
val reader = new AnnotationToolJsonReader(assertionLabels=List("AsPresent","Absent").asJava, splitChars=List(" ", "\\-").asJava, excludedLabels = List("Treatment").asJava)
val df = reader.readDataset(ResourceHelper.spark, filename)
val assertionDf = reader.generateAssertionTrainSet(df)
assertionDf.show()
+--------------------+--------------+---------+-----+---+
| text| target| label|start|end|
+--------------------+--------------+---------+-----+---+
|To have faith is ...| To have faith|AsPresent| 0| 2|
|To have faith is ...| have faith|AsPresent| 1| 2|
|To have faith is ...| to trust|AsPresent| 4| 5|
|To have faith is ...| to the|AsPresent| 7| 8|
|To have faith is ...| yourself|AsPresent| 6| 6|
|To have faith is ...| To have faith|AsPresent| 0| 2|
|To have faith is ...|trust yourself|AsPresent| 5| 6|
+--------------------+--------------+---------+-----+---+
Assertion
Trains AssertionDL, a deep learning based approach used to extract assertion status from extracted entities and text.
AssertionDLApproach
Trains an assertion status model using deep learning.
The training data should have annotation columns of type DOCUMENT, CHUNK and WORD_EMBEDDINGS, a label column (the assertion status you want to predict), a start column (the start index of the term that has the assertion status) and an end column (the end index of that term). The model uses a deep learning architecture to predict the assertion status.
Excluding the label, start and end columns, the required annotation columns can be produced with, for example:
- a SentenceDetector,
- a Chunker (or any annotator producing CHUNK annotations), and
- a WordEmbeddingsModel (any word embeddings can be chosen, e.g. BertEmbeddings for BERT-based embeddings).
Input Annotator Types: DOCUMENT, CHUNK, WORD_EMBEDDINGS
Output Annotator Type: ASSERTION
Python API: AssertionDLApproach | Scala API: AssertionDLApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
document_assembler = DocumentAssembler().setInputCol('text').setOutputCol('document')
sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
tokenizer = Tokenizer().setInputCols("sentence").setOutputCol("token")
POSTag = PerceptronModel.pretrained() \
.setInputCols("sentence", "token") \
.setOutputCol("pos")
chunker = Chunker() \
.setInputCols(["pos", "sentence"]) \
.setOutputCol("chunk") \
.setRegexParsers(["(<NN>)+"])
pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") \
.setInputCols("sentence", "token") \
.setOutputCol("embeddings") \
.setCaseSensitive(False)
assertion_status = AssertionDLApproach() \
.setInputCols("sentence", "chunk", "embeddings") \
.setOutputCol("assertion") \
.setStartCol("start") \
.setEndCol("end") \
.setLabelCol("label") \
.setLearningRate(0.01) \
.setDropout(0.15) \
.setBatchSize(16) \
.setEpochs(3) \
.setValidationSplit(0.2) \
.setIncludeConfidence(True)
pipeline = Pipeline().setStages([
document_assembler,
sentence_detector,
tokenizer,
POSTag,
chunker,
pubmed,
assertion_status
])
# The training data must contain a text column plus the "label", "start" and "end"
# columns configured above (the assertion status and the chunk boundaries).
data = spark.read.option("header", "true").csv("src/test/resources/rsAnnotations-1-120-random.csv")
pipelineModel = pipeline.fit(data)
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.annotators.{Chunker, Tokenizer}
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
import com.johnsnowlabs.nlp.annotator.PerceptronModel
import com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLApproach
import org.apache.spark.ml.Pipeline
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
val POSTag = PerceptronModel
.pretrained()
.setInputCols("sentence", "token")
.setOutputCol("pos")
val chunker = new Chunker()
.setInputCols(Array("pos", "sentence"))
.setOutputCol("chunk")
.setRegexParsers(Array("(<NN>)+"))
val pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
.setInputCols("sentence", "token")
.setOutputCol("embeddings")
.setCaseSensitive(false)
val assertionStatus = new AssertionDLApproach()
.setInputCols("sentence", "chunk", "embeddings")
.setOutputCol("assertion")
.setStartCol("start")
.setEndCol("end")
.setLabelCol("label")
.setLearningRate(0.01f)
.setDropout(0.15f)
.setBatchSize(16)
.setEpochs(3)
.setValidationSplit(0.2f)
val pipeline = new Pipeline().setStages(Array(
documentAssembler,
sentenceDetector,
tokenizer,
POSTag,
chunker,
pubmed,
assertionStatus
))
datasetPath = "/../src/test/resources/rsAnnotations-1-120-random.csv"
train_data = SparkContextForTest.spark.read.option("header", "true").csv(path="file:///" + os.getcwd() + datasetPath)
val pipelineModel = pipeline.fit(trainingData)
AssertionLogRegApproach
Trains an assertion status model using a logistic regression algorithm.
The training data should have annotation columns of type DOCUMENT, CHUNK and WORD_EMBEDDINGS, a label column (the assertion status you want to predict), a start column (the start index of the term that has the assertion status) and an end column (the end index of that term).
Excluding the label, start and end columns, the required annotation columns can be produced with, for example:
- a SentenceDetector,
- a Chunker (or any annotator producing CHUNK annotations), and
- a WordEmbeddingsModel (any word embeddings can be chosen, e.g. BertEmbeddings for BERT-based embeddings).
Input Annotator Types: DOCUMENT, CHUNK, WORD_EMBEDDINGS
Output Annotator Type: ASSERTION
Python API: AssertionLogRegApproach | Scala API: AssertionLogRegApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
document_assembler = DocumentAssembler().setInputCol('text').setOutputCol('document')
sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
tokenizer = Tokenizer().setInputCols("sentence").setOutputCol("token")
POSTag = PerceptronModel.pretrained() \
.setInputCols("sentence", "token") \
.setOutputCol("pos")
chunker = Chunker() \
.setInputCols(["pos", "sentence"]) \
.setOutputCol("chunk") \
.setRegexParsers(["(<NN>)+"])
pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") \
.setInputCols("sentence", "token") \
.setOutputCol("embeddings") \
.setCaseSensitive(False)
assertion_status = AssertionLogRegApproach() \
.setInputCols("sentence", "chunk", "embeddings") \
.setOutputCol("assertion") \
.setStartCol("start") \
.setEndCol("end") \
.setLabelCol("label") \
.setReg(0.01) \
.setBefore(11) \
.setAfter(13) \
.setEpochs(3)
pipeline = Pipeline().setStages([
document_assembler,
sentence_detector,
tokenizer,
POSTag,
chunker,
pubmed,
assertion_status
])
# The training data must contain a text column plus the "label", "start" and "end"
# columns configured above (the assertion status and the chunk boundaries).
data = spark.read.option("header", "true").csv("src/test/resources/rsAnnotations-1-120-random.csv")
pipelineModel = pipeline.fit(data)
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.annotators.{Chunker, Tokenizer}
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
import com.johnsnowlabs.nlp.annotator.PerceptronModel
import com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach
import org.apache.spark.ml.Pipeline
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")
val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")
val POSTag = PerceptronModel
.pretrained()
.setInputCols("sentence", "token")
.setOutputCol("pos")
val chunker = new Chunker()
.setInputCols(Array("pos", "sentence"))
.setOutputCol("chunk")
.setRegexParsers(Array("(<NN>)+"))
val pubmed = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
.setInputCols("sentence", "token")
.setOutputCol("embeddings")
.setCaseSensitive(false)
val assertion = new AssertionLogRegApproach()
.setLabelCol("label")
.setInputCols("document", "chunk", "embeddings")
.setOutputCol("assertion")
.setReg(0.01)
.setBefore(11)
.setAfter(13)
.setStartCol("start")
.setEndCol("end")
val pipeline = new Pipeline().setStages(Array(
documentAssembler,
sentenceDetector,
tokenizer,
POSTag,
chunker,
pubmed,
assertion
))
datasetPath = "/../src/test/resources/rsAnnotations-1-120-random.csv"
train_data = SparkContextForTest.spark.read.option("header", "true").csv(path="file:///" + os.getcwd() + datasetPath)
val pipelineModel = pipeline.fit(trainingData)
Token Classification
These are annotators that can be trained to recognize named entities in text.
MedicalNer
This Named Entity Recognition annotator allows training a generic NER model based on neural networks.
The architecture of the neural network is Char CNNs - BiLSTM - CRF, which achieves state-of-the-art results on most datasets.
For instantiated/pretrained models, see MedicalNerModel.
The training data should be a labeled Spark Dataset in the CoNLL 2003 IOB format with Annotation type columns. The data should have columns of type DOCUMENT, TOKEN, WORD_EMBEDDINGS and an additional label column of annotator type NAMED_ENTITY.
Excluding the label, these columns can be produced with, for example:
- a SentenceDetector,
- a Tokenizer and
- a WordEmbeddingsModel with clinical embeddings (any clinical word embeddings can be chosen).
Input Annotator Types: DOCUMENT, TOKEN, WORD_EMBEDDINGS
Output Annotator Type: NAMED_ENTITY
Python API: MedicalNerApproach | Scala API: MedicalNerApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline
# First extract the prerequisites for the NerDLApproach
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
sentence = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")
tokenizer = Tokenizer() \
.setInputCols(["sentence"]) \
.setOutputCol("token")
clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings")
# Then the training can start
nerTagger = MedicalNerApproach()\
.setInputCols(["sentence", "token", "embeddings"])\
.setLabelColumn("label")\
.setOutputCol("ner")\
.setMaxEpochs(2)\
.setBatchSize(64)\
.setRandomSeed(0)\
.setVerbose(1)\
.setValidationSplit(0.2)\
.setEvaluationLogExtended(True) \
.setEnableOutputLogs(True)\
.setIncludeConfidence(True)\
.setOutputLogsPath('ner_logs')\
.setGraphFolder('medical_ner_graphs')\
.setEnableMemoryOptimizer(True)  # if you have limited memory and a large conll file, set this to True to train batch by batch
pipeline = Pipeline().setStages([
documentAssembler,
sentence,
tokenizer,
clinical_embeddings,
nerTagger
])
# We use the text and labels from the CoNLL dataset
conll = CoNLL()
trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
pipelineModel = pipeline.fit(trainingData)
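Once fitted, the trained NER stage can be saved on its own and reused later; a minimal sketch using the standard Spark ML persistence API (the output path is an assumption):
# The MedicalNer model is the last stage of the fitted pipeline
ner_model = pipelineModel.stages[-1]
ner_model.write().overwrite().save("custom_medical_ner_model")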
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.embeddings.BertEmbeddings
import com.johnsnowlabs.nlp.annotators.ner.MedicalNerApproach
import com.johnsnowlabs.nlp.training.CoNLL
import org.apache.spark.ml.Pipeline
// First extract the prerequisites for the NerDLApproach
val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")
val sentence = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")
val tokenizer = new Tokenizer()
.setInputCols("sentence")
.setOutputCol("token")
val embeddings = BertEmbeddings.pretrained()
.setInputCols("sentence", "token")
.setOutputCol("embeddings")
// Then the training can start
val nerTagger =new MedicalNerApproach()
.setInputCols(Array("sentence", "token", "embeddings"))
.setLabelColumn("label")
.setOutputCol("ner")
.setMaxEpochs(5)
.setLr(0.003f)
.setBatchSize(8)
.setRandomSeed(0)
.setVerbose(1)
.setEvaluationLogExtended(false)
.setEnableOutputLogs(false)
.setIncludeConfidence(true)
val pipeline = new Pipeline().setStages(Array(
documentAssembler,
sentence,
tokenizer,
embeddings,
nerTagger
))
// We use the text and labels from the CoNLL dataset
val conll = CoNLL()
val trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
val pipelineModel = pipeline.fit(trainingData)
Text Classification
These are annotators that can be trained to classify text into different classes, such as sentiment.
DocumentLogRegClassifier
Trains a model to classify documents with a logistic regression algorithm. Training data requires columns for the text and its label. The result is a trained DocumentLogRegClassifierModel.
Input Annotator Types: TOKEN
Output Annotator Type: CATEGORY
Python API: DocumentLogRegClassifierApproach | Scala API: DocumentLogRegClassifierApproach |
Show Example
import sparknlp
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.training import *
import sparknlp_jsl
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
tokenizer = Tokenizer() \
.setInputCols("document") \
.setOutputCol("token")
normalizer = Normalizer() \
.setInputCols("token") \
.setOutputCol("normalized")
stopwords_cleaner = StopWordsCleaner()\
.setInputCols("normalized")\
.setOutputCol("cleanTokens")\
.setCaseSensitive(False)
stemmer = Stemmer() \
.setInputCols("cleanTokens") \
.setOutputCol("stem")
gen_clf = DocumentLogRegClassifierApproach() \
.setLabelColumn("category") \
.setInputCols("stem") \
.setOutputCol("prediction")
pipeline = Pipeline().setStages([
document_assembler,
tokenizer,
normalizer,
stopwords_cleaner,
stemmer,
gen_clf
])
clf_model = pipeline.fit(data)
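The training DataFrame (data above) only needs a text column and the label column configured in setLabelColumn; a minimal sketch, assuming a CSV file with text and category columns (file name and column names are assumptions):
# Hypothetical training file with one document per row and its class label
data = spark.read.option("header", "true").csv("document_classification_train.csv")
data.select("text", "category").show(5, truncate=50)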
GenericClassifier
Trains a TensorFlow model for generic classification of feature vectors. It takes FEATURE_VECTOR annotations from
FeaturesAssembler
as input, classifies them and outputs CATEGORY annotations.
Please see the Parameters section for required training parameters.
For a more extensive example please see the Spark NLP Workshop.
Input Annotator Types: FEATURE_VECTOR
Output Annotator Type: CATEGORY
Python API: GenericClassifierApproach | Scala API: GenericClassifierApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.training import *
import sparknlp_jsl
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
features_asm = FeaturesAssembler() \
.setInputCols(["feature_1", "feature_2", "...", "feature_n"]) \
.setOutputCol("features")
gen_clf = GenericClassifierApproach() \
.setLabelColumn("target") \
.setInputCols(["features"]) \
.setOutputCol("prediction") \
.setModelFile("/path/to/graph_file.pb") \
.setEpochsNumber(50) \
.setBatchSize(100) \
.setFeatureScaling("zscore") \
.setLearningRate(0.001) \
.setFixImbalance(True) \
.setOutputLogsPath("logs") \
.setValidationSplit(0.2) # keep 20% of the data for validation purposes
pipeline = Pipeline().setStages([
features_asm,
gen_clf
])
clf_model = pipeline.fit(data)
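The input DataFrame (data above) holds one numeric column per feature plus the target column configured in setLabelColumn; a minimal sketch with made-up values (feature and class names are assumptions):
# Toy feature vectors; in practice these columns come from your feature engineering step
data = spark.createDataFrame(
    [(1.0, 0.5, 3.2, "class_a"), (0.2, 1.7, 0.1, "class_b")],
    ["feature_1", "feature_2", "feature_n", "target"]
)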
Relation Models
RelationExtractionApproach
Trains a Relation Extraction Model to predict attributes and relations for entities in a sentence.
Relation Extraction is the key component for building relation knowledge graphs, and it is of crucial significance to natural language processing applications such as structured search, sentiment analysis, question answering, and summarization.
The training dataset is a CSV containing the following columns: sentence, chunk1, firstCharEnt1, lastCharEnt1, label1, chunk2, firstCharEnt2, lastCharEnt2, label2 and rel.
Excluding the rel column, the required annotation columns can be produced with, for example:
- a SentenceDetector,
- a Tokenizer and
- a WordEmbeddingsModel (any word embeddings can be chosen, e.g. BertEmbeddings for BERT based embeddings).
- The CHUNK annotations can be created from the firstCharEnt1, lastCharEnt1, chunk1, label1 columns and the firstCharEnt2, lastCharEnt2, chunk2, label2 columns.
An example of that dataset can be found in the following link i2b2_clinical_dataset
sentence,chunk1,firstCharEnt1,lastCharEnt1,label1,chunk2,firstCharEnt2,lastCharEnt2,label2,rel
Previous studies have reported the association of prodynorphin (PDYN) promoter polymorphism with temporal lobe epilepsy (TLE) susceptibility, but the results remain inconclusive.,PDYN,64,67,GENE,epilepsy,111,118,PHENOTYPE,0
The remaining cases, clinically similar to XLA, are autosomal recessive agammaglobulinemia (ARA).,XLA,43,45,GENE,autosomal recessive,52,70,PHENOTYPE,0
YAP/TAZ have been reported to be highly expressed in malignant tumors.,YAP,19,21,GENE,tumors,82,87,PHENOTYPE,0
Apart from that, no additional training data is needed.
Input Annotator Types: WORD_EMBEDDINGS, POS, CHUNK, DEPENDENCY
Output Annotator Type: CATEGORY
Python API: RelationExtractionApproach | Scala API: RelationExtractionApproach |
Show Example
import functools
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
annotationType = T.StructType([
T.StructField('annotatorType', T.StringType(), False),
T.StructField('begin', T.IntegerType(), False),
T.StructField('end', T.IntegerType(), False),
T.StructField('result', T.StringType(), False),
T.StructField('metadata', T.MapType(T.StringType(), T.StringType()), False),
T.StructField('embeddings', T.ArrayType(T.FloatType()), False)
])
@F.udf(T.ArrayType(annotationType))
def createTrainAnnotations(begin1, end1, begin2, end2, chunk1, chunk2, label1, label2):
entity1 = sparknlp.annotation.Annotation("chunk", begin1, end1, chunk1, {'entity': label1.upper(), 'sentence': '0'}, [])
entity2 = sparknlp.annotation.Annotation("chunk", begin2, end2, chunk2, {'entity': label2.upper(), 'sentence': '0'}, [])
entity1.annotatorType = "chunk"
entity2.annotatorType = "chunk"
return [entity1, entity2]
data = spark.read.option("header","true").format("csv").load("i2b2_clinical_rel_dataset.csv")
data = data \
    .withColumn("begin1i", F.expr("cast(firstCharEnt1 AS Int)")) \
    .withColumn("end1i", F.expr("cast(lastCharEnt1 AS Int)")) \
    .withColumn("begin2i", F.expr("cast(firstCharEnt2 AS Int)")) \
    .withColumn("end2i", F.expr("cast(lastCharEnt2 AS Int)")) \
    .where("begin1i IS NOT NULL") \
    .where("end1i IS NOT NULL") \
    .where("begin2i IS NOT NULL") \
    .where("end2i IS NOT NULL") \
    .withColumn(
        "train_ner_chunks",
        createTrainAnnotations(
            "begin1i", "end1i", "begin2i", "end2i", "chunk1", "chunk2", "label1", "label2"
        ).alias("train_ner_chunks", metadata={'annotatorType': "chunk"}))
documentAssembler = DocumentAssembler() \
.setInputCol("sentence") \
.setOutputCol("sentences")
tokenizer = Tokenizer() \
.setInputCols(["sentences"]) \
.setOutputCol("tokens")
words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") \
    .setInputCols(["sentences", "tokens"]) \
    .setOutputCol("embeddings")
pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models") \
    .setInputCols(["sentences", "tokens"]) \
    .setOutputCol("pos_tags")
dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en") \
    .setInputCols(["sentences", "pos_tags", "tokens"]) \
    .setOutputCol("dependencies")
reApproach = RelationExtractionApproach() \
    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"]) \
    .setOutputCol("relations") \
    .setLabelColumn("rel") \
    .setEpochsNumber(70) \
    .setBatchSize(200) \
    .setDropout(0.5) \
    .setLearningRate(0.001) \
    .setModelFile("/content/RE_in1200D_out20.pb") \
    .setFixImbalance(True) \
    .setFromEntity("begin1i", "end1i", "label1") \
    .setToEntity("begin2i", "end2i", "label2") \
    .setOutputLogsPath('/content')
train_pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    words_embedder,
    pos_tagger,
    dependency_parser,
    reApproach
])
rel_model = train_pipeline.fit(data)
import com.johnsnowlabs.nlp.{DocumentAssembler}
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.ner.{MedicalNerModel, NerConverter}
import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserModel
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel
import com.johnsnowlabs.nlp.annotators.re.RelationExtractionApproach
import com.johnsnowlabs.nlp.Annotation
import com.johnsnowlabs.nlp.AnnotatorType.CHUNK
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions._
val data = spark.read.option("header",true).csv("src/test/resources/re/gene_hpi.csv").limit(10)
def createTrainAnnotations = udf {
( begin1:Int, end1:Int, begin2:Int, end2:Int, chunk1:String, chunk2:String, label1:String, label2:String) => {
val an1 = Annotation(CHUNK,begin1,end1,chunk1,Map("entity" -> label1.toUpperCase,"sentence" -> "0"))
val an2 = Annotation(CHUNK,begin2,end2,chunk2,Map("entity" -> label2.toUpperCase,"sentence" -> "0"))
Seq(an1,an2)
}
}
val metadataBuilder: MetadataBuilder = new MetadataBuilder()
val meta = metadataBuilder.putString("annotatorType", CHUNK).build()
val dataEncoded = data
.withColumn("begin1i", expr("cast(firstCharEnt1 AS Int)"))
.withColumn("end1i", expr("cast(lastCharEnt1 AS Int)"))
.withColumn("begin2i", expr("cast(firstCharEnt2 AS Int)"))
.withColumn("end2i", expr("cast(lastCharEnt2 AS Int)"))
.where("begin1i IS NOT NULL")
.where("end1i IS NOT NULL")
.where("begin2i IS NOT NULL")
.where("end2i IS NOT NULL")
.withColumn(
"train_ner_chunks",
createTrainAnnotations(
col("begin1i"), col("end1i"), col("begin2i"), col("end2i"), col("chunk1"), col("chunk2"), col("label1"), col("label2")
).as("train_ner_chunks",meta))
val documentAssembler = new DocumentAssembler()
.setInputCol("sentence")
.setOutputCol("sentences")
val tokenizer = new Tokenizer()
.setInputCols(Array("sentences"))
.setOutputCol("tokens")
val embedder = WordEmbeddingsModel
.pretrained("embeddings_clinical", "en", "clinical/models")
.setInputCols(Array("sentences", "tokens"))
.setOutputCol("embeddings")
val posTagger = PerceptronModel
.pretrained("pos_clinical", "en", "clinical/models")
.setInputCols(Array("sentences", "tokens"))
.setOutputCol("posTags")
val nerTagger = MedicalNerModel
.pretrained("ner_events_clinical", "en", "clinical/models")
.setInputCols(Array("sentences", "tokens", "embeddings"))
.setOutputCol("ner_tags")
val nerConverter = new NerConverter()
.setInputCols(Array("sentences", "tokens", "ner_tags"))
.setOutputCol("nerChunks")
val depencyParser = DependencyParserModel
.pretrained("dependency_conllu", "en")
.setInputCols(Array("sentences", "posTags", "tokens"))
.setOutputCol("dependencies")
val re = new RelationExtractionApproach()
.setInputCols(Array("embeddings", "posTags", "train_ner_chunks", "dependencies"))
.setOutputCol("rel")
.setLabelColumn("target_rel")
.setEpochsNumber(30)
.setBatchSize(200)
.setlearningRate(0.001f)
.setValidationSplit(0.05f)
.setFromEntity("begin1i", "end1i", "label1")
.setToEntity("end2i", "end2i", "label2")
val pipeline = new Pipeline()
.setStages(Array(
documentAssembler,
tokenizer,
embedder,
posTagger,
nerTagger,
nerConverter,
depencyParser,
re))
val model = pipeline.fit(dataEncoded)
Entity Resolution
These models predict the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED, etc.).
SentenceEntityResolver
Contains all the parameters and methods to train a SentenceEntityResolverModel. The model transforms a dataset with input annotation type SENTENCE_EMBEDDINGS, coming from e.g. BertSentenceEmbeddings, and returns the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED, etc.).
To use pretrained models please use SentenceEntityResolverModel and see the Models Hub for available models.
Input Annotator Types: SENTENCE_EMBEDDINGS
Output Annotator Type: ENTITY
Python API: SentenceEntityResolverApproach | Scala API: SentenceEntityResolverApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.training import *
import sparknlp_jsl
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
# Training a SNOMED resolution model using BERT sentence embeddings
# Define pre-processing pipeline for training data. It needs to consist of columns for the normalized training data and their labels.
documentAssembler = DocumentAssembler() \
.setInputCol("normalized_text") \
.setOutputCol("document")
sentenceDetector = SentenceDetector()\
.setInputCols(["document"])\
.setOutputCol("sentence")
bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased") \
.setInputCols(["sentence"]) \
.setOutputCol("bert_embeddings")
snomedTrainingPipeline = Pipeline(stages=[
documentAssembler,
sentenceDetector,
bertEmbeddings
])
snomedTrainingModel = snomedTrainingPipeline.fit(data)
snomedData = snomedTrainingModel.transform(data).cache()
# Then the Resolver can be trained with
bertExtractor = SentenceEntityResolverApproach() \
.setNeighbours(25) \
.setThreshold(1000) \
.setInputCols(["bert_embeddings"]) \
.setNormalizedCol("normalized_text") \
.setLabelCol("label") \
.setOutputCol("snomed_code") \
.setDistanceFunction("EUCLIDIAN") \
.setCaseSensitive(False)
snomedModel = bertExtractor.fit(snomedData)
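The training DataFrame (data above) is essentially the terminology dictionary: one row per concept term, holding the normalized term text and its code as the label. A minimal sketch, assuming a CSV export of the terminology (file name and original column names are assumptions):
# Map the terminology columns to the names used by the pipeline above
data = spark.read.option("header", "true").csv("snomed_terminology.csv") \
    .withColumnRenamed("term", "normalized_text") \
    .withColumnRenamed("conceptId", "label")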
// Training a SNOMED resolution model using BERT sentence embeddings
// Define pre-processing pipeline for training data. It needs to consist of columns for the normalized training data and their labels.
val documentAssembler = new DocumentAssembler()
.setInputCol("normalized_text")
.setOutputCol("document")
val sentenceDetector = new SentenceDetector()
.setInputCols("document")
.setOutputCol("sentence")
val bertEmbeddings = BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased")
.setInputCols("sentence")
.setOutputCol("bert_embeddings")
val snomedTrainingPipeline = new Pipeline().setStages(Array(
documentAssembler,
sentenceDetector,
bertEmbeddings
))
val snomedTrainingModel = snomedTrainingPipeline.fit(data)
val snomedData = snomedTrainingModel.transform(data).cache()
// Then the Resolver can be trained with
val bertExtractor = new SentenceEntityResolverApproach()
.setNeighbours(25)
.setThreshold(1000)
.setInputCols("bert_embeddings")
.setNormalizedCol("normalized_text")
.setLabelCol("label")
.setOutputCol("snomed_code")
.setDistanceFunction("EUCLIDIAN")
.setCaseSensitive(false)
val snomedModel = bertExtractor.fit(snomedData)
ChunkEntityResolver
Contains all the parameters and methods to train a ChunkEntityResolverModel. It transforms a dataset with two input annotations of types TOKEN and WORD_EMBEDDINGS, coming from e.g. ChunkTokenizer and ChunkEmbeddings annotators, and returns the normalized entity for a particular trained ontology / curated dataset (e.g. ICD-10, RxNorm, SNOMED, etc.).
To use pretrained models please use ChunkEntityResolverModel and see the Models Hub for available models.
Input Annotator Types: TOKEN, WORD_EMBEDDINGS
Output Annotator Type: ENTITY
Python API: ChunkEntityResolverApproach | Scala API: ChunkEntityResolverApproach |
Show Example
import sparknlp
from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.training import *
import sparknlp_jsl
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
# Training a SNOMED model
# Define pre-processing pipeline for training data. It needs to consist of columns for the normalized training data and their labels.
document = DocumentAssembler() \
.setInputCol("normalized_text") \
.setOutputCol("document")
chunk = Doc2Chunk() \
.setInputCols(["document"]) \
.setOutputCol("chunk")
token = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")
embeddings = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models") \
.setInputCols(["document", "token"]) \
.setOutputCol("embeddings")
chunkEmb = ChunkEmbeddings() \
.setInputCols(["chunk", "embeddings"]) \
.setOutputCol("chunk_embeddings")
snomedTrainingPipeline = Pipeline().setStages([
document,
chunk,
token,
embeddings,
chunkEmb
])
snomedTrainingModel = snomedTrainingPipeline.fit(data)
snomedData = snomedTrainingModel.transform(data).cache()
# Then the Resolver can be trained with
snomedExtractor = ChunkEntityResolverApproach() \
.setInputCols(["token", "chunk_embeddings"]) \
.setOutputCol("recognized") \
.setNeighbours(1000) \
.setAlternatives(25) \
.setNormalizedCol("normalized_text") \
.setLabelCol("label") \
.setEnableWmd(True).setEnableTfidf(True).setEnableJaccard(True) \
.setEnableSorensenDice(True).setEnableJaroWinkler(True).setEnableLevenshtein(True) \
.setDistanceWeights([1, 2, 2, 1, 1, 1]) \
.setAllDistancesMetadata(True) \
.setPoolingStrategy("MAX") \
.setThreshold(1e32)
model = snomedExtractor.fit(snomedData)