com.johnsnowlabs.nlp.annotators.classification

DocumentLogRegClassifierApproach

class DocumentLogRegClassifierApproach extends AnnotatorApproach[DocumentLogRegClassifierModel] with CheckLicense

Trains a model to classify documents with a Logistic Regression algorithm. The training data must contain a column for the text and a column for its label. The result is a trained DocumentLogRegClassifierModel.

Example

Define pipeline stages to prepare the data

val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val normalizer = new Normalizer()
  .setInputCols("token")
  .setOutputCol("normalized")

val stopwords_cleaner = new StopWordsCleaner()
  .setInputCols("normalized")
  .setOutputCol("cleanTokens")
  .setCaseSensitive(false)

val stemmer = new Stemmer()
  .setInputCols("cleanTokens")
  .setOutputCol("stem")

Define the document classifier and fit training data to it

val logreg = new DocumentLogRegClassifierApproach()
  .setInputCols("stem")
  .setLabelCol("category")
  .setOutputCol("prediction")

val pipeline = new Pipeline().setStages(Array(
  document_assembler,
  tokenizer,
  normalizer,
  stopwords_cleaner,
  stemmer,
  logreg
))

val model = pipeline.fit(trainingData)

See also: DocumentLogRegClassifierModel for instantiated models

Linear Supertypes

CheckLicense, AnnotatorApproach[DocumentLogRegClassifierModel], CanBeLazy, DefaultParamsWritable, MLWritable, HasOutputAnnotatorType, HasOutputAnnotationCol, HasInputAnnotationCols, Estimator[DocumentLogRegClassifierModel], PipelineStage, Logging, Params, Serializable, Serializable, Identifiable, AnyRef, Any

Ordering

Grouped
Alphabetic
By Inheritance

Inherited

DocumentLogRegClassifierApproach
CheckLicense
AnnotatorApproach
CanBeLazy
DefaultParamsWritable
MLWritable
HasOutputAnnotatorType
HasOutputAnnotationCol
HasInputAnnotationCols
Estimator
PipelineStage
Logging
Params
Serializable
Serializable
Identifiable
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new DocumentLogRegClassifierApproach()
new DocumentLogRegClassifierApproach(uid: String)
uid
a unique identifier for the instantiated annotator

Type Members

type AnnotatorType = String

Definition Classes
HasOutputAnnotatorType

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def $[T](param: Param[T]): T

Attributes
protected
Definition Classes
Params
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def _fit(dataset: Dataset[_], recursiveStages: Option[PipelineModel]): DocumentLogRegClassifierModel

Attributes
protected
Definition Classes
AnnotatorApproach
final def asInstanceOf[T0]: T0

Definition Classes
Any
def beforeTraining(spark: SparkSession): Unit

Definition Classes
AnnotatorApproach
final def checkSchema(schema: StructType, inputAnnotatorType: String): Boolean

Attributes
protected
Definition Classes
HasInputAnnotationCols
def checkValidEnvironment(spark: Option[SparkSession], scopes: Seq[String], metadata: Option[Map[String, Value]]): Unit

Definition Classes
CheckLicense
def checkValidScope(scope: String): Unit

Definition Classes
CheckLicense
def checkValidScopeAndEnvironment(scope: String, spark: Option[SparkSession], checkLp: Boolean, metadata: Option[Map[String, Value]]): Unit

Definition Classes
CheckLicense
def checkValidScopesAndEnvironment(scopes: Seq[String], spark: Option[SparkSession], checkLp: Boolean, metadata: Option[Map[String, Value]]): Unit

Definition Classes
CheckLicense
val classificationModelPath: Param[String]
Path to the classification model, if one has already been trained.
final def clear(param: Param[_]): DocumentLogRegClassifierApproach.this.type

Definition Classes
Params
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
final def copy(extra: ParamMap): Estimator[DocumentLogRegClassifierModel]

Definition Classes
AnnotatorApproach → Estimator → PipelineStage → Params
def copyValues[T <: Params](to: T, extra: ParamMap): T

Attributes
protected
Definition Classes
Params
final def defaultCopy[T <: Params](extra: ParamMap): T

Attributes
protected
Definition Classes
Params
val description: String

Definition Classes
DocumentLogRegClassifierApproach → AnnotatorApproach
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def explainParam(param: Param[_]): String

Definition Classes
Params
def explainParams(): String

Definition Classes
Params
final val extraInputCols: StringArrayParam

Attributes
protected
Definition Classes
HasInputAnnotationCols
final def extractParamMap(): ParamMap

Definition Classes
Params
final def extractParamMap(extra: ParamMap): ParamMap

Definition Classes
Params
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def fit(dataset: Dataset[_]): DocumentLogRegClassifierModel

Definition Classes
AnnotatorApproach → Estimator
def fit(dataset: Dataset[_], paramMaps: Seq[ParamMap]): Seq[DocumentLogRegClassifierModel]

Definition Classes
Estimator
Annotations
@Since( "2.0.0" )
def fit(dataset: Dataset[_], paramMap: ParamMap): DocumentLogRegClassifierModel

Definition Classes
Estimator
Annotations
@Since( "2.0.0" )
def fit(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DocumentLogRegClassifierModel

Definition Classes
Estimator
Annotations
@Since( "2.0.0" ) @varargs()
val fitIntercept: Param[Boolean]
whether to fit an intercept term (Default: true)
final def get[T](param: Param[T]): Option[T]

Definition Classes
Params
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
def getClassificationModelPath: String
Path to the classification model, if one has already been trained.
final def getDefault[T](param: Param[T]): Option[T]

Definition Classes
Params
def getFitIntercept: Boolean
whether to fit an intercept term (Default: true)
def getInputCols: Array[String]

Definition Classes
HasInputAnnotationCols
def getLabelCol: String
Column containing the label values that the model is trained to predict.
def getLabels: Array[String]
array to output the label in the original form.
def getLazyAnnotator: Boolean

Definition Classes
CanBeLazy
def getMaxIter: Int
maximum number of iterations (Default: 10)
def getMergeChunks: Boolean
whether to merge all chunks in a document or not (Default: false)
final def getOrDefault[T](param: Param[T]): T

Definition Classes
Params
final def getOutputCol: String

Definition Classes
HasOutputAnnotationCol
def getParam(paramName: String): Param[Any]

Definition Classes
Params
def getTol: Double
convergence tolerance after each iteration (Default: 1e-6)
def getVectorizationModelPath: String
Path to the vectorization model, if one has already been trained.
final def hasDefault[T](param: Param[T]): Boolean

Definition Classes
Params
def hasParam(paramName: String): Boolean

Definition Classes
Params
def hashCode(): Int

Definition Classes
AnyRef → Any
Annotations
@native()
lazy val idf: IDF
def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean

Attributes
protected
Definition Classes
Logging
def initializeLogIfNecessary(isInterpreter: Boolean): Unit

Attributes
protected
Definition Classes
Logging
val inputAnnotatorTypes: Array[AnnotatorType]
Input annotator types: TOKEN
Input annotator types: TOKEN

Definition Classes
DocumentLogRegClassifierApproach → HasInputAnnotationCols
final val inputCols: StringArrayParam

Attributes
protected
Definition Classes
HasInputAnnotationCols
final def isDefined(param: Param[_]): Boolean

Definition Classes
Params
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def isSet(param: Param[_]): Boolean

Definition Classes
Params
def isTraceEnabled(): Boolean

Attributes
protected
Definition Classes
Logging
val labelCol: Param[String]
Column containing the label values that the model is trained to predict.
lazy val labelEncodedCol: String
lazy val labelPredictedCol: String
lazy val labelRawCol: String
val labels: StringArrayParam
array to output the label in the original form.
val lazyAnnotator: BooleanParam

Definition Classes
CanBeLazy
def log: Logger

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logName: String

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
lazy val lrClassifier: LogisticRegression
val maxIter: Param[Int]
maximum number of iterations (Default: 10)
val mergeChunks: BooleanParam
whether to merge all chunks in a document or not (Default: false)
def msgHelper(schema: StructType): String

Attributes
protected
Definition Classes
HasInputAnnotationCols
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def onTrained(model: DocumentLogRegClassifierModel, spark: SparkSession): Unit

Definition Classes
AnnotatorApproach
val optionalInputAnnotatorTypes: Array[String]

Definition Classes
HasInputAnnotationCols
val outputAnnotatorType: AnnotatorType
Output annotator types: CATEGORY
Output annotator types: CATEGORY

Definition Classes
DocumentLogRegClassifierApproach → HasOutputAnnotatorType
final val outputCol: Param[String]

Attributes
protected
Definition Classes
HasOutputAnnotationCol
lazy val ovrClassifier: OneVsRest
lazy val params: Array[Param[_]]

Definition Classes
Params
def save(path: String): Unit

Definition Classes
MLWritable
Annotations
@Since( "1.6.0" ) @throws( ... )
final def set(paramPair: ParamPair[_]): DocumentLogRegClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def set(param: String, value: Any): DocumentLogRegClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def set[T](param: Param[T], value: T): DocumentLogRegClassifierApproach.this.type

Definition Classes
Params
def setClassificationModelPath(value: String): DocumentLogRegClassifierApproach.this.type
Path to the classification model, if one has already been trained.
final def setDefault(paramPairs: ParamPair[_]*): DocumentLogRegClassifierApproach.this.type

Attributes
protected
Definition Classes
Params
final def setDefault[T](param: Param[T], value: T): DocumentLogRegClassifierApproach.this.type

Attributes
protected[org.apache.spark.ml]
Definition Classes
Params
def setExtraInputCols(value: Array[String]): DocumentLogRegClassifierApproach.this.type

Definition Classes
HasInputAnnotationCols
def setFitIntercept(value: Boolean): DocumentLogRegClassifierApproach.this.type
whether to fit an intercept term (Default: true)
final def setInputCols(value: String*): DocumentLogRegClassifierApproach.this.type

Definition Classes
HasInputAnnotationCols
def setInputCols(value: Array[String]): DocumentLogRegClassifierApproach.this.type

Definition Classes
HasInputAnnotationCols
def setLabelCol(value: String): DocumentLogRegClassifierApproach.this.type
Column containing the label values that the model is trained to predict.
def setLabels(value: Array[String]): DocumentLogRegClassifierApproach.this.type
array to output the label in the original form.
def setLazyAnnotator(value: Boolean): DocumentLogRegClassifierApproach.this.type

Definition Classes
CanBeLazy
def setMaxIter(value: Int): DocumentLogRegClassifierApproach.this.type
maximum number of iterations (Default: 10)
def setMergeChunks(value: Boolean): DocumentLogRegClassifierApproach.this.type
whether to merge all chunks in a document or not (Default: false)
final def setOutputCol(value: String): DocumentLogRegClassifierApproach.this.type

Definition Classes
HasOutputAnnotationCol
def setTol(value: Double): DocumentLogRegClassifierApproach.this.type
convergence tolerance after each iteration (Default: 1e-6)
def setVectorizationModelPath(value: String): DocumentLogRegClassifierApproach.this.type
Path to the vectorization model, if one has already been trained.
lazy val sidx: StringIndexer
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
lazy val tf: HashingTF
lazy val tfCol: String
lazy val tfidfCol: String
def toString(): String

Definition Classes
Identifiable → AnyRef → Any
lazy val tokenAnnotationCol: String
lazy val tokenRawCol: String
val tol: Param[Double]
convergence tolerance after each iteration (Default: 1e-6)
def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): DocumentLogRegClassifierModel

Definition Classes
DocumentLogRegClassifierApproach → AnnotatorApproach
final def transformSchema(schema: StructType): StructType

Definition Classes
AnnotatorApproach → PipelineStage
def transformSchema(schema: StructType, logging: Boolean): StructType

Attributes
protected
Definition Classes
PipelineStage
Annotations
@DeveloperApi()
val uid: String

Definition Classes
DocumentLogRegClassifierApproach → Identifiable
def validate(schema: StructType): Boolean

Attributes
protected
Definition Classes
AnnotatorApproach
val vectorizationModelPath: Param[String]
Path to the vectorization model, if one has already been trained.
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
def write: MLWriter

Definition Classes
DefaultParamsWritable → MLWritable

Packages

DocumentLogRegClassifierApproach

class DocumentLogRegClassifierApproach extends AnnotatorApproach[DocumentLogRegClassifierModel] with CheckLicense

Example

Instance Constructors

Type Members

Value Members

Inherited from CheckLicense

Inherited from AnnotatorApproach[DocumentLogRegClassifierModel]

Inherited from CanBeLazy

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from HasOutputAnnotatorType

Inherited from HasOutputAnnotationCol

Inherited from HasInputAnnotationCols

Inherited from Estimator[DocumentLogRegClassifierModel]

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

Parameters

Annotator types

Members

Parameter setters

Parameter getters

Packages

DocumentLogRegClassifierApproach 

class DocumentLogRegClassifierApproach extends AnnotatorApproach[DocumentLogRegClassifierModel] with CheckLicense

Example

Instance Constructors

Type Members

Value Members

Inherited from CheckLicense

Inherited from AnnotatorApproach[DocumentLogRegClassifierModel]

Inherited from CanBeLazy

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from HasOutputAnnotatorType

Inherited from HasOutputAnnotationCol

Inherited from HasInputAnnotationCols

Inherited from Estimator[DocumentLogRegClassifierModel]

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

Parameters

Annotator types

Members

Parameter setters

Parameter getters

DocumentLogRegClassifierApproach