com.johnsnowlabs.finance.sequence_classification

FinanceDocumentMLClassifierApproach

class FinanceDocumentMLClassifierApproach extends DocumentMLClassifierApproach

Trains a model to classify documents with a Logistic Regression algorithm. Training data requires a column for the text and a column for its label. The result is a trained FinanceDocumentMLClassifierModel.

Example

Define pipeline stages to prepare the data

val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val normalizer = new Normalizer()
  .setInputCols("token")
  .setOutputCol("normalized")

val stopwords_cleaner = new StopWordsCleaner()
  .setInputCols("normalized")
  .setOutputCol("cleanTokens")
  .setCaseSensitive(false)

val stemmer = new Stemmer()
  .setInputCols("cleanTokens")
  .setOutputCol("stem")

Define the document classifier and fit training data to it

val logreg = new FinanceDocumentMLClassifierApproach()
  .setInputCols("stem")
  .setLabelCol("category")
  .setOutputCol("prediction")

val pipeline = new Pipeline().setStages(Array(
  document_assembler,
  tokenizer,
  normalizer,
  stopwords_cleaner,
  stemmer,
  logreg
))

val model = pipeline.fit(trainingData)

See also: FinanceDocumentMLClassifierModel for instantiated models

Linear Supertypes

DocumentMLClassifierApproach, CheckLicense, DocumentMLClassifierParams, AnnotatorApproach[DocumentMLClassifierModel], CanBeLazy, DefaultParamsWritable, MLWritable, HasOutputAnnotatorType, HasOutputAnnotationCol, HasInputAnnotationCols, Estimator[DocumentMLClassifierModel], PipelineStage, Logging, Params, Serializable, Serializable, Identifiable, AnyRef, Any

Ordering

Grouped
Alphabetic
By Inheritance

Inherited

FinanceDocumentMLClassifierApproach
DocumentMLClassifierApproach
CheckLicense
DocumentMLClassifierParams
AnnotatorApproach
CanBeLazy
DefaultParamsWritable
MLWritable
HasOutputAnnotatorType
HasOutputAnnotationCol
HasInputAnnotationCols
Estimator
PipelineStage
Logging
Params
Serializable
Serializable
Identifiable
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new FinanceDocumentMLClassifierApproach()
new FinanceDocumentMLClassifierApproach(uid: String)
uid
a unique identifier for the instantiated AnnotatorApproach

Type Members

type AnnotatorType = String

Definition Classes
HasOutputAnnotatorType

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def $[T](param: Param[T]): T

Attributes
protected
Definition Classes
Params
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def _fit(dataset: Dataset[_], recursiveStages: Option[PipelineModel]): DocumentMLClassifierModel

Attributes
protected
Definition Classes
AnnotatorApproach
final def asInstanceOf[T0]: T0

Definition Classes
Any
def beforeTraining(spark: SparkSession): Unit

Definition Classes
AnnotatorApproach
def calculateNgramsUdf: UserDefinedFunction

Definition Classes
DocumentMLClassifierParams
final def checkSchema(schema: StructType, inputAnnotatorType: String): Boolean

Attributes
protected
Definition Classes
HasInputAnnotationCols
def checkValidEnvironment(spark: Option[SparkSession], scopes: Seq[String], metadata: Option[Map[String, String]]): Unit

Definition Classes
CheckLicense
def checkValidScope(scope: String): Unit

Definition Classes
CheckLicense
def checkValidScopeAndEnvironment(scope: String, spark: Option[SparkSession], checkLp: Boolean, metadata: Option[Map[String, String]]): Unit

Definition Classes
CheckLicense
def checkValidScopesAndEnvironment(scopes: Seq[String], spark: Option[SparkSession], checkLp: Boolean, metadata: Option[Map[String, String]]): Unit

Definition Classes
CheckLicense
val classificationModelClass: Param[String]
specify the SparkML classification class to use.
specify the SparkML classification class to use.

Definition Classes
DocumentMLClassifierApproach
val classificationModelPath: Param[String]
specify the classification model if it has been already trained.
specify the classification model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
final def clear(param: Param[_]): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
Params
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
final def copy(extra: ParamMap): Estimator[DocumentMLClassifierModel]

Definition Classes
AnnotatorApproach → Estimator → PipelineStage → Params
def copyValues[T <: Params](to: T, extra: ParamMap): T

Attributes
protected
Definition Classes
Params
final def defaultCopy[T <: Params](extra: ParamMap): T

Attributes
protected
Definition Classes
Params
val description: String

Definition Classes
DocumentMLClassifierApproach → AnnotatorApproach
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def explainParam(param: Param[_]): String

Definition Classes
Params
def explainParams(): String

Definition Classes
Params
final def extractParamMap(): ParamMap

Definition Classes
Params
final def extractParamMap(extra: ParamMap): ParamMap

Definition Classes
Params
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def fit(dataset: Dataset[_]): DocumentMLClassifierModel

Definition Classes
AnnotatorApproach → Estimator
def fit(dataset: Dataset[_], paramMaps: Seq[ParamMap]): Seq[DocumentMLClassifierModel]

Definition Classes
Estimator
Annotations
@Since( "2.0.0" )
def fit(dataset: Dataset[_], paramMap: ParamMap): DocumentMLClassifierModel

Definition Classes
Estimator
Annotations
@Since( "2.0.0" )
def fit(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DocumentMLClassifierModel

Definition Classes
Estimator
Annotations
@Since( "2.0.0" ) @varargs()
val fitIntercept: Param[Boolean]
whether to fit an intercept term (Default: true)
whether to fit an intercept term (Default: true)

Definition Classes
DocumentMLClassifierApproach
final def get[T](param: Param[T]): Option[T]

Definition Classes
Params
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
def getClassificationModelClass: String
get the SparkML classification class to use
get the SparkML classification class to use

Definition Classes
DocumentMLClassifierApproach
def getClassificationModelPath: String
get the classification model if it has been already trained.
get the classification model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
final def getDefault[T](param: Param[T]): Option[T]

Definition Classes
Params
def getFitIntercept: Boolean
get whether to fit an intercept term (Default: true)
get whether to fit an intercept term (Default: true)

Definition Classes
DocumentMLClassifierApproach
def getInputCols: Array[String]

Definition Classes
HasInputAnnotationCols
def getLabelCol: String
column with the value result we are trying to predict.
column with the value result we are trying to predict.

Definition Classes
DocumentMLClassifierApproach
def getLabels: Array[String]
array to output the label in the original form.
array to output the label in the original form.

Definition Classes
DocumentMLClassifierParams
def getLazyAnnotator: Boolean

Definition Classes
CanBeLazy
def getMaxIter: Int
maximum number of iterations (Default: 10)
maximum number of iterations (Default: 10)

Definition Classes
DocumentMLClassifierApproach
def getMaxTokenNgramFingerprint: Int

Definition Classes
DocumentMLClassifierParams
def getMergeChunks: Boolean
whether to merge all chunks in a document or not (Default: false)
whether to merge all chunks in a document or not (Default: false)

Definition Classes
DocumentMLClassifierParams
def getMinTokenNgramFingerprint: Int

Definition Classes
DocumentMLClassifierParams
final def getOrDefault[T](param: Param[T]): T

Definition Classes
Params
final def getOutputCol: String

Definition Classes
HasOutputAnnotationCol
def getParam(paramName: String): Param[Any]

Definition Classes
Params
def getTol: Double
get convergence tolerance after each iteration (Default: 1e-6)
get convergence tolerance after each iteration (Default: 1e-6)

Definition Classes
DocumentMLClassifierApproach
def getVectorizationModelPath: String
get the vectorization model if it has been already trained.
get the vectorization model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
final def hasDefault[T](param: Param[T]): Boolean

Definition Classes
Params
def hasParam(paramName: String): Boolean

Definition Classes
Params
def hashCode(): Int

Definition Classes
AnyRef → Any
Annotations
@native()
lazy val idf: IDF

Definition Classes
DocumentMLClassifierApproach
def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean

Attributes
protected
Definition Classes
Logging
def initializeLogIfNecessary(isInterpreter: Boolean): Unit

Attributes
protected
Definition Classes
Logging
val inputAnnotatorTypes: Array[AnnotatorType]
Input annotator types: TOKEN
Input annotator types: TOKEN

Definition Classes
DocumentMLClassifierApproach → HasInputAnnotationCols
final val inputCols: StringArrayParam

Attributes
protected
Definition Classes
HasInputAnnotationCols
final def isDefined(param: Param[_]): Boolean

Definition Classes
Params
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def isSet(param: Param[_]): Boolean

Definition Classes
Params
def isTraceEnabled(): Boolean

Attributes
protected
Definition Classes
Logging
val labelCol: Param[String]
column with the value result we are trying to predict.
column with the value result we are trying to predict.

Definition Classes
DocumentMLClassifierApproach
lazy val labelEncodedCol: String

Definition Classes
DocumentMLClassifierApproach
lazy val labelPredictedCol: String

Definition Classes
DocumentMLClassifierApproach
lazy val labelRawCol: String

Definition Classes
DocumentMLClassifierApproach
val labels: StringArrayParam
array to output the label in the original form.
array to output the label in the original form.

Definition Classes
DocumentMLClassifierParams
val lazyAnnotator: BooleanParam

Definition Classes
CanBeLazy
def log: Logger

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logName: String

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
val maxIter: Param[Int]
maximum number of iterations (Default: 10)
maximum number of iterations (Default: 10)

Definition Classes
DocumentMLClassifierApproach
val maxTokenNgram: IntParam
the max number of tokens for Ngrams
the max number of tokens for Ngrams

Definition Classes
DocumentMLClassifierParams
val mergeChunks: BooleanParam
whether to merge all chunks in a document or not (Default: false)
whether to merge all chunks in a document or not (Default: false)

Definition Classes
DocumentMLClassifierParams
val minTokenNgram: IntParam
the min number of tokens for Ngrams
the min number of tokens for Ngrams

Definition Classes
DocumentMLClassifierParams
lazy val mlClassifier: Classifier[_, _, _] with HasMaxIter with HasTol with HasFitIntercept

Definition Classes
DocumentMLClassifierApproach
def msgHelper(schema: StructType): String

Attributes
protected
Definition Classes
HasInputAnnotationCols
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def onTrained(model: DocumentMLClassifierModel, spark: SparkSession): Unit

Definition Classes
AnnotatorApproach
val optionalInputAnnotatorTypes: Array[String]

Definition Classes
HasInputAnnotationCols
val outputAnnotatorType: AnnotatorType
Output annotator types: CATEGORY
Output annotator types: CATEGORY

Definition Classes
DocumentMLClassifierApproach → HasOutputAnnotatorType
final val outputCol: Param[String]

Attributes
protected
Definition Classes
HasOutputAnnotationCol
lazy val ovrClassifier: OneVsRest

Definition Classes
DocumentMLClassifierApproach
lazy val params: Array[Param[_]]

Definition Classes
Params
def save(path: String): Unit

Definition Classes
MLWritable
Annotations
@Since( "1.6.0" ) @throws( ... )
final def set(paramPair: ParamPair[_]): FinanceDocumentMLClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def set(param: String, value: Any): FinanceDocumentMLClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def set[T](param: Param[T], value: T): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
Params
def setClassificationModelClass(value: String): FinanceDocumentMLClassifierApproach.this.type
set the SparkML classification class to use
set the SparkML classification class to use

Definition Classes
DocumentMLClassifierApproach
def setClassificationModelPath(value: String): FinanceDocumentMLClassifierApproach.this.type
set the classification model if it has been already trained.
set the classification model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
final def setDefault(paramPairs: ParamPair[_]*): FinanceDocumentMLClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def setDefault[T](param: Param[T], value: T): FinanceDocumentMLClassifierApproach.this.type

Attributes
protected[org.apache.spark.ml]
Definition Classes
Params
def setFitIntercept(value: Boolean): FinanceDocumentMLClassifierApproach.this.type
set whether to fit an intercept term (Default: true)
set whether to fit an intercept term (Default: true)

Definition Classes
DocumentMLClassifierApproach
final def setInputCols(value: String*): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
HasInputAnnotationCols
def setInputCols(value: Array[String]): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
HasInputAnnotationCols
def setLabelCol(value: String): FinanceDocumentMLClassifierApproach.this.type
column with the value result we are trying to predict.
column with the value result we are trying to predict.

Definition Classes
DocumentMLClassifierApproach
def setLabels(value: Array[String]): FinanceDocumentMLClassifierApproach.this.type
array to output the label in the original form.
array to output the label in the original form.

Definition Classes
DocumentMLClassifierParams
def setLazyAnnotator(value: Boolean): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
CanBeLazy
def setMaxIter(value: Int): FinanceDocumentMLClassifierApproach.this.type
maximum number of iterations (Default: 10)
maximum number of iterations (Default: 10)

Definition Classes
DocumentMLClassifierApproach
def setMaxTokenNgram(value: Int): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
DocumentMLClassifierApproach
def setMaxTokenNgramFingerprint(value: Int): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
DocumentMLClassifierParams
def setMergeChunks(value: Boolean): FinanceDocumentMLClassifierApproach.this.type
whether to merge all chunks in a document or not (Default: false)
whether to merge all chunks in a document or not (Default: false)

Definition Classes
DocumentMLClassifierParams
def setMinTokenNgram(value: Int): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
DocumentMLClassifierApproach
def setMinTokenNgramFingerprint(value: Int): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
DocumentMLClassifierParams
final def setOutputCol(value: String): FinanceDocumentMLClassifierApproach.this.type

Definition Classes
HasOutputAnnotationCol
def setTol(value: Double): FinanceDocumentMLClassifierApproach.this.type
set convergence tolerance after each iteration (Default: 1e-6)
set convergence tolerance after each iteration (Default: 1e-6)

Definition Classes
DocumentMLClassifierApproach
def setVectorizationModelPath(value: String): FinanceDocumentMLClassifierApproach.this.type
set the vectorization model if it has been already trained.
set the vectorization model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
lazy val sidx: StringIndexer

Definition Classes
DocumentMLClassifierApproach
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
lazy val tf: HashingTF

Definition Classes
DocumentMLClassifierApproach
lazy val tfCol: String

Definition Classes
DocumentMLClassifierApproach
lazy val tfidfCol: String

Definition Classes
DocumentMLClassifierApproach
def toString(): String

Definition Classes
Identifiable → AnyRef → Any
lazy val tokenAnnotationCol: String

Definition Classes
DocumentMLClassifierApproach
lazy val tokenRawCol: String

Definition Classes
DocumentMLClassifierApproach
val tol: Param[Double]
convergence tolerance after each iteration (Default: 1e-6)
convergence tolerance after each iteration (Default: 1e-6)

Definition Classes
DocumentMLClassifierApproach
def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): FinanceDocumentMLClassifierModel

Definition Classes
FinanceDocumentMLClassifierApproach → DocumentMLClassifierApproach → AnnotatorApproach
final def transformSchema(schema: StructType): StructType

Definition Classes
AnnotatorApproach → PipelineStage
def transformSchema(schema: StructType, logging: Boolean): StructType

Attributes
protected
Definition Classes
PipelineStage
Annotations
@DeveloperApi()
val uid: String

Definition Classes
FinanceDocumentMLClassifierApproach → DocumentMLClassifierApproach → Identifiable
def validate(schema: StructType): Boolean

Attributes
protected
Definition Classes
AnnotatorApproach
val vectorizationModelPath: Param[String]
specify the vectorization model if it has been already trained.
specify the vectorization model if it has been already trained.

Definition Classes
DocumentMLClassifierApproach
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
def write: MLWriter

Definition Classes
DefaultParamsWritable → MLWritable

Packages

FinanceDocumentMLClassifierApproach

class FinanceDocumentMLClassifierApproach extends DocumentMLClassifierApproach

Example

Instance Constructors

Type Members

Value Members

Inherited from DocumentMLClassifierApproach

Inherited from CheckLicense

Inherited from DocumentMLClassifierParams

Inherited from AnnotatorApproach[DocumentMLClassifierModel]

Inherited from CanBeLazy

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from HasOutputAnnotatorType

Inherited from HasOutputAnnotationCol

Inherited from HasInputAnnotationCols

Inherited from Estimator[DocumentMLClassifierModel]

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

Parameters

Annotator types

Members

Parameter setters

Parameter getters

Packages

FinanceDocumentMLClassifierApproach 

class FinanceDocumentMLClassifierApproach extends DocumentMLClassifierApproach

Example

Instance Constructors

Type Members

Value Members

Inherited from DocumentMLClassifierApproach

Inherited from CheckLicense

Inherited from DocumentMLClassifierParams

Inherited from AnnotatorApproach[DocumentMLClassifierModel]

Inherited from CanBeLazy

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from HasOutputAnnotatorType

Inherited from HasOutputAnnotationCol

Inherited from HasInputAnnotationCols

Inherited from Estimator[DocumentMLClassifierModel]

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

Parameters

Annotator types

Members

Parameter setters

Parameter getters

FinanceDocumentMLClassifierApproach