Packages

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true) extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the format of CoNLL 2003 and needs to be specified with readDataset. Other CoNLL datasets are not supported.

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
documentCol

Name of the DOCUMENT Annotator type column

sentenceCol

Name of the Sentences of DOCUMENT Annotator type column

tokenCol

Name of the TOKEN Annotator type column

posCol

Name of the POS Annotator type column

conllLabelIndex

Index of the column for NER Label in the dataset

conllPosIndex

Index of the column for the POS tags in the dataset

conllTextCol

Index of the column for the text in the dataset

labelCol

Name of the NAMED_ENTITY Annotator type column

explodeSentences

Whether to explode each sentence to a separate row

Linear Supertypes
Serializable, Serializable, Product, Equals, AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. CoNLL
  2. Serializable
  3. Serializable
  4. Product
  5. Equals
  6. AnyRef
  7. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true)

    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the Sentences of DOCUMENT Annotator type column

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for NER Label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllTextCol

    Index of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. val annotationType: ArrayType
  5. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  6. def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]
  7. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  8. val conllLabelIndex: Int
  9. val conllPosIndex: Int
  10. val conllTextCol: String
  11. val documentCol: String
  12. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  13. val explodeSentences: Boolean
  14. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  15. def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
  16. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  17. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  18. val labelCol: String
  19. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  20. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  21. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  22. def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
  23. def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]
  24. def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]
  25. def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
  26. def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
  27. def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
  28. val posCol: String
  29. def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]
  30. def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
  31. def readDocs(er: ExternalResource): Seq[CoNLLDocument]
  32. def readLines(lines: Array[String]): Seq[CoNLLDocument]
  33. def schema: StructType
  34. val sentenceCol: String
  35. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  36. val tokenCol: String
  37. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  38. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  39. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped