
class TextMatcherInternal extends AnnotatorApproach[TextMatcherInternalModel] with TextMatcherInternalParams with ParamsAndFeaturesWritable

Annotator to match exact phrases (by token) provided in a file against a Document.

A text file of predefined phrases must be provided with setEntities. The text file can also be set directly as an ExternalResource.

For extended examples of usage, see the

Example

In this example, the entities file is of the form

...
dolore magna aliqua
lorem ipsum dolor. sit
laborum
...

where each line represents an entity phrase to be extracted.

import spark.implicits._
import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.annotator.Tokenizer
import com.johnsnowlabs.nlp.annotator.TextMatcherInternal
import com.johnsnowlabs.nlp.util.io.ReadAs
import org.apache.spark.ml.Pipeline

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val tokenizer = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

val data = Seq("Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum").toDF("text")
val entityExtractor = new TextMatcherInternal()
  .setInputCols("document", "token")
  .setEntities("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT)
  .setOutputCol("entity")
  .setCaseSensitive(false)
  .setTokenizer(tokenizer.fit(data))

val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, entityExtractor))
val results = pipeline.fit(data).transform(data)

results.selectExpr("explode(entity) as result").show(false)
+------------------------------------------------------------------------------------------+
|result                                                                                    |
+------------------------------------------------------------------------------------------+
|[chunk, 6, 24, dolore magna aliqua, [entity -> entity, sentence -> 0, chunk -> 0], []]    |
|[chunk, 27, 48, Lorem ipsum dolor. sit, [entity -> entity, sentence -> 0, chunk -> 1], []]|
|[chunk, 53, 59, laborum, [entity -> entity, sentence -> 0, chunk -> 2], []]               |
+------------------------------------------------------------------------------------------+
Linear Supertypes
ParamsAndFeaturesWritable, TextMatcherInternalParams, HasFeatures, AnnotatorApproach[TextMatcherInternalModel], CanBeLazy, DefaultParamsWritable, MLWritable, HasOutputAnnotatorType, HasOutputAnnotationCol, HasInputAnnotationCols, Estimator[TextMatcherInternalModel], PipelineStage, Logging, Params, Serializable, Serializable, Identifiable, AnyRef, Any

Instance Constructors

  1. new TextMatcherInternal()
  2. new TextMatcherInternal(uid: String)

    uid

    internal uid required to generate writable annotators

Type Members

  1. type AnnotatorType = String
    Definition Classes
    HasOutputAnnotatorType

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def $[T](param: Param[T]): T
    Attributes
    protected
    Definition Classes
    Params
  4. def $$[T](feature: StructFeature[T]): T
    Attributes
    protected
    Definition Classes
    HasFeatures
  5. def $$[K, V](feature: MapFeature[K, V]): Map[K, V]
    Attributes
    protected
    Definition Classes
    HasFeatures
  6. def $$[T](feature: SetFeature[T]): Set[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  7. def $$[T](feature: ArrayFeature[T]): Array[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  8. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  9. def _fit(dataset: Dataset[_], recursiveStages: Option[PipelineModel]): TextMatcherInternalModel
    Attributes
    protected
    Definition Classes
    AnnotatorApproach
  10. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  11. def beforeTraining(spark: SparkSession): Unit
    Definition Classes
    AnnotatorApproach
  12. val buildFromTokens: BooleanParam

    Whether the TextMatcherInternal should take the CHUNK from TOKEN or not (Default: false)

  13. def cartesianTokenVariants(tokens: Seq[Annotation], lemmaDictionary: Map[String, String]): Seq[Seq[String]]
    Attributes
    protected
    Definition Classes
    TextMatcherInternalParams
  14. val caseSensitive: BooleanParam

    Whether to match regardless of case (Default: true)

    Definition Classes
    TextMatcherInternalParams
  15. final def checkSchema(schema: StructType, inputAnnotatorType: String): Boolean
    Attributes
    protected
    Definition Classes
    HasInputAnnotationCols
  16. val cleanKeywords: StringArrayParam

    A parameter defining additional keywords to be removed during text processing, in addition to the standard stopwords.

    These keywords are appended to the default stopwords list and will be excluded from the text when cleanStopWords is enabled.

    By default, this parameter is an empty array, meaning no additional keywords are filtered unless specified.
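
    For illustration, a minimal sketch (keyword values are hypothetical, imports as in the example above) combining cleanStopWords with additional keywords to drop:

    // Remove the default stopwords plus a few extra, domain-specific keywords
    // before matching. The keyword values below are illustrative only.
    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setCleanStopWords(true)
      .setCleanKeywords(Array("status", "history"))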

    Definition Classes
    TextMatcherInternalParams
  17. val cleanStopWords: BooleanParam

    Parameter indicating whether to clean stop words during text processing. Defaults to true.

    Definition Classes
    TextMatcherInternalParams
  18. final def clear(param: Param[_]): TextMatcherInternal.this.type
    Definition Classes
    Params
  19. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  20. final def copy(extra: ParamMap): Estimator[TextMatcherInternalModel]
    Definition Classes
    AnnotatorApproach → Estimator → PipelineStage → Params
  21. def copyValues[T <: Params](to: T, extra: ParamMap): T
    Attributes
    protected
    Definition Classes
    Params
  22. final def defaultCopy[T <: Params](extra: ParamMap): T
    Attributes
    protected
    Definition Classes
    Params
  23. val delimiter: Param[String]

    Delimiter separating the phrase from its entity label in the entities file (Default: ,)
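
    For illustration, a hypothetical entities file where each line holds a phrase and its label separated by a custom delimiter (phrases, labels, and the file path are made up):

    paracetamol 500 mg#DRUG
    type 2 diabetes#PROBLEM

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEntities("entities_with_labels.txt", ReadAs.TEXT) // hypothetical path
      .setDelimiter("#")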

  24. val description: String

    Extracts entities from target dataset given in a text file

    Definition Classes
    TextMatcherInternal → AnnotatorApproach
  25. val enableLemmatizer: BooleanParam

    A Boolean parameter that controls whether lemmatization should be applied during text processing.

    Lemmatization is the process of reducing words to their base or dictionary form (lemma). When this parameter is set to true: - The incoming tokens (words from the input text) are lemmatized. - The predefined entities (the terms you want to match against) are also lemmatized.

    This allows for more flexible and accurate matching. For example, words like "running", "ran", or "runs" will all be reduced to "run", and can match consistently even if the exact form in the text differs.

    Default value is false, meaning lemmatization is disabled unless explicitly turned on.
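
    For illustration, a minimal sketch (using the imports from the example above; the entities path is hypothetical) that turns lemmatization on; a lemma dictionary can be supplied with setLemmatizerDictionary:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEntities("entities.txt", ReadAs.TEXT) // hypothetical path
      .setEnableLemmatizer(true)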

    Definition Classes
    TextMatcherInternalParams
  26. val enableStemmer: BooleanParam

    A Boolean parameter that controls whether stemming should be applied during text processing.

    Stemming reduces words to their root forms (e.g., "running", "runs", and "runner" → "run"). This can help match different word forms more effectively in tasks such as keyword matching and entity recognition.

    When this parameter is set to true, stemming is applied in addition to the original form: - Input tokens are matched both in their original and stemmed forms. - Target entities can also be matched using their stemmed forms.

    This does not replace original matching — it complements it. Matching is performed using both the original and processed (stemmed) versions to improve recall and flexibility.

    Default value is false.
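
    A minimal sketch (configuration values are illustrative) enabling stemming in addition to exact matching:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEntities("entities.txt", ReadAs.TEXT) // hypothetical path
      .setEnableStemmer(true) // "running", "runs" can now match an entity phrase containing "run"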

    Definition Classes
    TextMatcherInternalParams
  27. val entities: ExternalResourceParam

    External resource for the entities, e.g. a text file where each line is the string of an entity

  28. val entityValue: Param[String]

    Value for the entity metadata field (Default: "entity")

  29. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  30. def equals(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  31. val excludePunctuation: BooleanParam

    A parameter indicating whether punctuation marks should be removed during text processing.

    When set to true, most punctuation characters will be excluded from the processed text. This is typically used to clean text by removing non-word characters.

    Defaults to true, meaning punctuation is removed unless explicitly disabled. Some characters may be preserved if specifically handled by other parameters (e.g., safe keywords).

    Definition Classes
    TextMatcherInternalParams
  32. val excludeRegexPatterns: StringArrayParam

    A parameter specifying regular expression patterns used to exclude matching chunks during text processing.

    Each string in this array is a regex pattern. If a detected chunk matches any of these patterns, it will be discarded and excluded from the final output.

    This is useful for removing unwanted matches based on pattern rules (e.g., specific codes, formats, or noise). By default, this parameter is empty, meaning no chunks are dropped based on regex.
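
    For illustration, a minimal sketch (patterns and path are illustrative) that discards any matched chunk consisting only of digits or looking like a short code:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEntities("entities.txt", ReadAs.TEXT) // hypothetical path
      .setExcludeRegexPatterns(Array("\\d+", "[A-Z]{2}-\\d{4}"))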

    Definition Classes
    TextMatcherInternalParams
  33. def explainParam(param: Param[_]): String
    Definition Classes
    Params
  34. def explainParams(): String
    Definition Classes
    Params
  35. final def extractParamMap(): ParamMap
    Definition Classes
    Params
  36. final def extractParamMap(extra: ParamMap): ParamMap
    Definition Classes
    Params
  37. val features: ArrayBuffer[Feature[_, _, _]]
    Definition Classes
    HasFeatures
  38. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  39. final def fit(dataset: Dataset[_]): TextMatcherInternalModel
    Definition Classes
    AnnotatorApproach → Estimator
  40. def fit(dataset: Dataset[_], paramMaps: Seq[ParamMap]): Seq[TextMatcherInternalModel]
    Definition Classes
    Estimator
    Annotations
    @Since( "2.0.0" )
  41. def fit(dataset: Dataset[_], paramMap: ParamMap): TextMatcherInternalModel
    Definition Classes
    Estimator
    Annotations
    @Since( "2.0.0" )
  42. def fit(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): TextMatcherInternalModel
    Definition Classes
    Estimator
    Annotations
    @Since( "2.0.0" ) @varargs()
  43. def get[T](feature: StructFeature[T]): Option[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  44. def get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  45. def get[T](feature: SetFeature[T]): Option[Set[T]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  46. def get[T](feature: ArrayFeature[T]): Option[Array[T]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  47. final def get[T](param: Param[T]): Option[T]
    Definition Classes
    Params
  48. def getBuildFromTokens: Boolean

    Getter for buildFromTokens param

  49. def getCaseSensitive: Boolean

    Whether to match regardless of case (Default: true)

  50. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  51. def getCleanKeywords: Array[String]

    Retrieves the list of keywords to be filtered out.

    returns

    an array of strings representing the keywords.

    Definition Classes
    TextMatcherInternalParams
  52. def getCleanStopWords: Boolean

    Retrieves the current state of the cleanStopWords parameter.

    returns

    true if the cleanStopWords option is enabled, false otherwise.

    Definition Classes
    TextMatcherInternalParams
  53. final def getDefault[T](param: Param[T]): Option[T]
    Definition Classes
    Params
  54. def getDelimiter: String

    Gets the delimiter separating the phrase from its entity label in the entities file (Default: ,)

  55. def getDictionary: ExternalResource

    External dictionary to be used by the lemmatizer

  56. def getEnableLemmatizer: Boolean

    Gets the current state of the lemmatizer enablement setting.

    returns

    true if the lemmatizer is enabled, false otherwise.

    Definition Classes
    TextMatcherInternalParams
  57. def getEnableStemmer: Boolean

    Retrieves the current value of the enableStemmer parameter.

    returns

    true if stemming is enabled, false otherwise

    Definition Classes
    TextMatcherInternalParams
  58. def getEntityValue: String

    Getter for Value for the entity metadata field

  59. def getExcludeRegexPattern: Array[String]

    Retrieves the list of regex patterns used to exclude specific text matches during processing.

    returns

    an array of strings representing the regex patterns to be excluded.

    Definition Classes
    TextMatcherInternalParams
  60. def getInputCols: Array[String]
    Definition Classes
    HasInputAnnotationCols
  61. def getLazyAnnotator: Boolean
    Definition Classes
    CanBeLazy
  62. def getMergeOverlapping: Boolean

    Whether to merge overlapping matched chunks (Default: false)

  63. final def getOrDefault[T](param: Param[T]): T
    Definition Classes
    Params
  64. final def getOutputCol: String
    Definition Classes
    HasOutputAnnotationCol
  65. def getParam(paramName: String): Param[Any]
    Definition Classes
    Params
  66. def getReturnChunks: String

    Retrieves the current value of the returnChunks parameter.

    returns

    A string representing the configured value for the returnChunks setting.

    Definition Classes
    TextMatcherInternalParams
  67. def getSafeKeywords: Array[String]

    Retrieves the list of safe keywords that are preserved during stop-word cleaning.

    returns

    an array of strings representing the keywords.

    Definition Classes
    TextMatcherInternalParams
  68. def getShuffleEntitySubTokens: Boolean

    Getter for shuffleEntitySubTokens param

  69. def getSkipMatcherAugmentation: Boolean

    Gets whether augmentation for matcher patterns is skipped.

    returns

    true if augmentation for matcher patterns is skipped, false otherwise.

    Definition Classes
    TextMatcherInternalParams
  70. def getSkipSourceTextAugmentation: Boolean

    Gets whether augmentation for source text is skipped.

    returns

    true if augmentation for source text is skipped, false otherwise.

    Definition Classes
    TextMatcherInternalParams
  71. def getStopWords: Array[String]

    Retrieves the list of stop words used within the text matching process.

    returns

    an array of strings representing the stop words.

    Definition Classes
    TextMatcherInternalParams
  72. def getTokenVariants(token: Annotation, lemmaDictionary: Map[String, String]): Seq[String]
    Attributes
    protected
    Definition Classes
    TextMatcherInternalParams
  73. def getTokenizer: TokenizerModel

    The Tokenizer to perform tokenization with

  74. final def hasDefault[T](param: Param[T]): Boolean
    Definition Classes
    Params
  75. def hasParam(paramName: String): Boolean
    Definition Classes
    Params
  76. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  77. def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
    Attributes
    protected
    Definition Classes
    Logging
  78. def initializeLogIfNecessary(isInterpreter: Boolean): Unit
    Attributes
    protected
    Definition Classes
    Logging
  79. val inputAnnotatorTypes: Array[String]

    Input annotator types: DOCUMENT, TOKEN

    Definition Classes
    TextMatcherInternal → HasInputAnnotationCols
  80. final val inputCols: StringArrayParam
    Attributes
    protected
    Definition Classes
    HasInputAnnotationCols
  81. final def isDefined(param: Param[_]): Boolean
    Definition Classes
    Params
  82. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  83. final def isSet(param: Param[_]): Boolean
    Definition Classes
    Params
  84. def isTraceEnabled(): Boolean
    Attributes
    protected
    Definition Classes
    Logging
  85. val lazyAnnotator: BooleanParam
    Definition Classes
    CanBeLazy
  86. val lemmaDict: MapFeature[String, String]

    Internal dictionary mapping words to their lemmas, used when the lemmatizer is enabled.

    Definition Classes
    TextMatcherInternalParams
  87. val lemmatizerDictionary: ExternalResourceParam

    External dictionary to be used by the lemmatizer, which needs 'keyDelimiter' and 'valueDelimiter' for parsing the resource

    Example

    ...
    pick	->	pick	picks	picking	picked
    peck	->	peck	pecking	pecked	pecks
    pickle	->	pickle	pickles	pickled	pickling
    pepper	->	pepper	peppers	peppered	peppering
    ...

    where each key is delimited by -> and values are delimited by \t
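
    For illustration, a minimal sketch (the dictionary path is hypothetical) loading a dictionary in the format shown above:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEnableLemmatizer(true)
      .setLemmatizerDictionary("lemmas.txt", keyDelimiter = "->", valueDelimiter = "\t", readAs = ReadAs.TEXT)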

  88. def log: Logger
    Attributes
    protected
    Definition Classes
    Logging
  89. def logDebug(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  90. def logDebug(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  91. def logError(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  92. def logError(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  93. def logInfo(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  94. def logInfo(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  95. def logName: String
    Attributes
    protected
    Definition Classes
    Logging
  96. def logTrace(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  97. def logTrace(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  98. def logWarning(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  99. def logWarning(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  100. val mergeOverlapping: BooleanParam

    Whether to merge overlapping matched chunks (Default: false)

  101. def msgHelper(schema: StructType): String
    Attributes
    protected
    Definition Classes
    HasInputAnnotationCols
  102. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  103. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  104. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  105. def onTrained(model: TextMatcherInternalModel, spark: SparkSession): Unit
    Definition Classes
    AnnotatorApproach
  106. def onWrite(path: String, spark: SparkSession): Unit
    Attributes
    protected
    Definition Classes
    ParamsAndFeaturesWritable
  107. val optionalInputAnnotatorTypes: Array[String]
    Definition Classes
    HasInputAnnotationCols
  108. val outputAnnotatorType: AnnotatorType

    Output annotator type : CHUNK

    Definition Classes
    TextMatcherInternal → HasOutputAnnotatorType
  109. final val outputCol: Param[String]
    Attributes
    protected
    Definition Classes
    HasOutputAnnotationCol
  110. lazy val params: Array[Param[_]]
    Definition Classes
    Params
  111. val returnChunks: Param[String]

    A string parameter that defines which version of the matched chunks should be returned: "original" or "matched".

    - If set to "original" (default): the returned chunks reflect the exact text spans as they appeared in the original input. This ensures that the begin and end character indices accurately map to the source text.

    - If set to "matched": the returned chunks are based on the processed form that triggered the match, such as a stemmed or lemmatized version of the phrase. This can be useful to see which normalized entity was matched, but the character indices (begin, end) may not align correctly with the original input text.

    Use "original" if accurate text positioning is important (e.g., for highlighting), and "matched" if you want to inspect the normalized form used for the match.
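
    A minimal sketch (assuming matcher is a TextMatcherInternal instance) switching between the two modes:

    // Keep the exact spans from the input text, so begin/end indices stay valid.
    matcher.setReturnChunks("original")

    // Return the normalized (e.g. stemmed or lemmatized) form that produced the match;
    // begin/end indices may no longer align with the input text.
    matcher.setReturnChunks("matched")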

    Definition Classes
    TextMatcherInternalParams
  112. val safeKeywords: StringArrayParam

    A parameter representing an array of keywords that should be preserved during text cleaning, when stopword removal (cleanStopWords) is enabled.

    When cleanStopWords is set to true, common stopwords are typically removed from the text. However, keywords specified in safeKeywords will be exempt from removal and retained in the processed text.

    By default, this parameter is an empty array, meaning no exceptions are made unless explicitly provided.
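
    A minimal sketch (keyword values are hypothetical) keeping selected words even though stop-word cleaning is enabled:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setCleanStopWords(true)               // remove the default stopwords
      .setSafeKeywords(Array("no", "not"))   // but keep these tokens in the text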

    Definition Classes
    TextMatcherInternalParams
  113. lazy val safeLemmaDict: Map[String, String]
    Definition Classes
    TextMatcherInternalParams
  114. def save(path: String): Unit
    Definition Classes
    MLWritable
    Annotations
    @Since( "1.6.0" ) @throws( ... )
  115. def set[T](feature: StructFeature[T], value: T): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  116. def set[K, V](feature: MapFeature[K, V], value: Map[K, V]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  117. def set[T](feature: SetFeature[T], value: Set[T]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  118. def set[T](feature: ArrayFeature[T], value: Array[T]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  119. final def set(paramPair: ParamPair[_]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    Params
  120. final def set(param: String, value: Any): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    Params
  121. final def set[T](param: Param[T], value: T): TextMatcherInternal.this.type
    Definition Classes
    Params
  122. def setBuildFromTokens(v: Boolean): TextMatcherInternal.this.type

    Setter for buildFromTokens param

  123. def setCaseSensitive(v: Boolean): TextMatcherInternal.this.type

    Whether to match regardless of case (Default: true)

  124. def setCleanKeywords(value: ArrayList[String]): TextMatcherInternal.this.type
    Definition Classes
    TextMatcherInternalParams
  125. def setCleanKeywords(values: Array[String]): TextMatcherInternal.this.type

    Sets the list of keywords to be cleaned during text processing.

    returns

    This instance with the updated configuration for cleaning keywords.

    Definition Classes
    TextMatcherInternalParams
  126. def setCleanStopWords(v: Boolean): TextMatcherInternal.this.type

    Sets whether to clean stop words during text processing.

    v

    Boolean value indicating whether to enable (true) or disable (false) the cleaning of stop words.

    returns

    This instance with the updated configuration for cleaning stop words.

    Definition Classes
    TextMatcherInternalParams
  127. def setDefault[T](feature: StructFeature[T], value: () ⇒ T): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  128. def setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  129. def setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  130. def setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  131. final def setDefault(paramPairs: ParamPair[_]*): TextMatcherInternal.this.type
    Attributes
    protected
    Definition Classes
    Params
  132. final def setDefault[T](param: Param[T], value: T): TextMatcherInternal.this.type
    Attributes
    protected[org.apache.spark.ml]
    Definition Classes
    Params
  133. def setDelimiter(v: String): TextMatcherInternal.this.type

    Sets the delimiter separating the phrase from its entity label in the entities file (Default: ,)

  134. def setEnableLemmatizer(value: Boolean): TextMatcherInternal.this.type

    Enables or disables the lemmatizer for text matching.

    value

    If true, the lemmatizer will be enabled; if false, it will be disabled.

    returns

    This TextMatcherInternal instance with the updated lemmatizer setting.

    Definition Classes
    TextMatcherInternalParams
  135. def setEnableStemmer(value: Boolean): TextMatcherInternal.this.type

    Enables or disables the use of a stemmer for text processing.

    value

    Boolean value indicating whether to enable (true) or disable (false) the stemmer.

    returns

    Instance of this class with updated configuration.

    Definition Classes
    TextMatcherInternalParams
  136. def setEntities(path: String, readAs: Format, options: Map[String, String] = Map("format" -> "text")): TextMatcherInternal.this.type

    Provides a file with phrases to match. Default: Looks up path in configuration.

    path

    a path to a file that contains the entities in the specified format.

    readAs

    the format of the file, can be one of {ReadAs.TEXT, ReadAs.SPARK}. Defaults to ReadAs.TEXT.

    options

    a map of additional parameters. Defaults to Map("format" -> "text").

    returns

    this

  137. def setEntities(value: ExternalResource): TextMatcherInternal.this.type

    Provides a file with phrases to match (Default: Looks up path in configuration)

  138. def setEntityValue(v: String): TextMatcherInternal.this.type

    Setter for Value for the entity metadata field

  139. def setExcludePunctuation(v: Boolean): TextMatcherInternal.this.type

    Sets the value of the excludePunctuation parameter used for text processing.

    v

    A boolean value indicating whether to exclude punctuation.

    returns

    This instance with the updated excludePunctuation configuration.

    Definition Classes
    TextMatcherInternalParams
  140. def setExcludeRegexPatterns(v: Array[String]): TextMatcherInternal.this.type

    Sets the regular expression patterns for excluding specific elements during text processing.

    v

    Array of strings where each string represents a regular expression pattern to be used for excluding matching text elements.

    returns

    This instance with the updated configuration for exclude regex patterns.

    Definition Classes
    TextMatcherInternalParams
  141. final def setInputCols(value: String*): TextMatcherInternal.this.type
    Definition Classes
    HasInputAnnotationCols
  142. def setInputCols(value: Array[String]): TextMatcherInternal.this.type
    Definition Classes
    HasInputAnnotationCols
  143. def setLazyAnnotator(value: Boolean): TextMatcherInternal.this.type
    Definition Classes
    CanBeLazy
  144. def setLemmaDict(value: Map[String, String]): TextMatcherInternal.this.type

    Sets the internal dictionary used for lemmatization.

    value

    a map where keys are words and values are their corresponding lemmas.

    returns

    this

    Definition Classes
    TextMatcherInternalParams
  145. def setLemmatizerDictionary(path: String, keyDelimiter: String, valueDelimiter: String, readAs: Format = ReadAs.TEXT, options: Map[String, String] = Map("format" -> "text")): TextMatcherInternal.this.type

    External dictionary to be used by the lemmatizer, which needs keyDelimiter and valueDelimiter for parsing the resource

  146. def setLemmatizerDictionary(value: ExternalResource): TextMatcherInternal.this.type
  147. def setMergeOverlapping(v: Boolean): TextMatcherInternal.this.type

    Whether to merge overlapping matched chunks (Default: false)

  148. final def setOutputCol(value: String): TextMatcherInternal.this.type
    Definition Classes
    HasOutputAnnotationCol
  149. def setReturnChunks(v: String): TextMatcherInternal.this.type

    Sets the value of the returnChunks parameter used for text processing.

    v

    A string value that specifies the configuration for returning chunks.

    returns

    This instance with the updated returnChunks configuration.

    Definition Classes
    TextMatcherInternalParams
  150. def setSafeKeywords(value: ArrayList[String]): TextMatcherInternal.this.type
    Definition Classes
    TextMatcherInternalParams
  151. def setSafeKeywords(v: Array[String]): TextMatcherInternal.this.type

    Sets the list of safe keywords to be used in text processing.

    v

    Array of strings representing the safe keywords.

    returns

    This instance with the updated configuration for safe keywords.

    Definition Classes
    TextMatcherInternalParams
  152. def setShuffleEntitySubTokens(value: Boolean): TextMatcherInternal.this.type

    Setter for shuffleEntitySubTokens param

  153. def setSkipMatcherAugmentation(value: Boolean): TextMatcherInternal.this.type

    Sets whether to skip augmentation for matcher patterns.

    value

    If true, matcher patterns won't be augmented with lemmatization, stemming, etc. If false, matcher patterns will be augmented if the corresponding features are enabled.

    returns

    This instance with the updated configuration.

    Definition Classes
    TextMatcherInternalParams
  154. def setSkipSourceTextAugmentation(value: Boolean): TextMatcherInternal.this.type

    Sets whether to skip augmentation for source text.

    value

    If true, source text won't be augmented with lemmatization, stemming, etc. If false, source text will be augmented if the corresponding features are enabled.

    returns

    This instance with the updated configuration.

    Definition Classes
    TextMatcherInternalParams
  155. def setStopWords(value: ArrayList[String]): TextMatcherInternal.this.type
    Definition Classes
    TextMatcherInternalParams
  156. def setStopWords(v: Array[String]): TextMatcherInternal.this.type

    Sets the list of stop words to be used in text processing.

    v

    Array of strings representing the stop words.

    returns

    This instance with the updated stop words setting.

    Definition Classes
    TextMatcherInternalParams
  157. def setTokenizer(tokenizer: TokenizerModel): TextMatcherInternal.this.type

    The Tokenizer to perform tokenization with

  158. val shuffleEntitySubTokens: BooleanParam

    Whether to generate and use variations (permutations) of the entity phrases. Defaults to false.

  159. val skipMatcherAugmentation: BooleanParam

    A Boolean parameter that controls whether to skip augmentation (lemmatization, stemming, etc.) for matcher patterns.

    When set to true, the matcher patterns won't be augmented with lemmatization, stemming, stopword removal, etc., even if those features are enabled. This applies only to entities/patterns being matched, not the source text.

    Default value is false, meaning matcher patterns will be augmented if the corresponding features are enabled.

    Definition Classes
    TextMatcherInternalParams
  160. val skipSourceTextAugmentation: BooleanParam

    A Boolean parameter that controls whether to skip augmentation (lemmatization, stemming, etc.) for the source text.

    When set to true, the source text won't be augmented with lemmatization, stemming, stopword removal, etc., even if those features are enabled. This applies only to the source text being analyzed, not the matcher patterns.

    Default value is false, meaning source text will be augmented if the corresponding features are enabled.
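
    A minimal sketch (configuration is illustrative) combining this with skipMatcherAugmentation so that only the entity phrases are augmented while the source text is left untouched:

    val matcher = new TextMatcherInternal()
      .setInputCols("document", "token")
      .setOutputCol("entity")
      .setEnableStemmer(true)
      .setSkipMatcherAugmentation(false)    // entity phrases are stemmed
      .setSkipSourceTextAugmentation(true)  // source text is not modified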

    Definition Classes
    TextMatcherInternalParams
  161. val stopWords: StringArrayParam

    A parameter representing the list of stop words to be filtered out during text processing.

    By default, it is set to the English stop words provided by Spark ML.

    Definition Classes
    TextMatcherInternalParams
  162. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  163. def toString(): String
    Definition Classes
    Identifiable → AnyRef → Any
  164. val tokenizer: StructFeature[TokenizerModel]

    The Tokenizer to perform tokenization with

  165. def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): TextMatcherInternalModel
    Definition Classes
    TextMatcherInternal → AnnotatorApproach
  166. final def transformSchema(schema: StructType): StructType
    Definition Classes
    AnnotatorApproach → PipelineStage
  167. def transformSchema(schema: StructType, logging: Boolean): StructType
    Attributes
    protected
    Definition Classes
    PipelineStage
    Annotations
    @DeveloperApi()
  168. val uid: String
    Definition Classes
    TextMatcherInternal → Identifiable
  169. def validate(schema: StructType): Boolean
    Attributes
    protected
    Definition Classes
    AnnotatorApproach
  170. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  171. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  172. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  173. def write: MLWriter
    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable

