context

package context

Ordering

Alphabetic

Visibility

Public
All

Type Members

trait ContextRules[T] extends Serializable

class ContextualEntityFilterer extends AnnotatorModel[ContextualEntityFilterer] with HasSimpleAnnotate[ContextualEntityFilterer] with HandleExceptionParams with HasSafeAnnotate[ContextualEntityFilterer] with CheckLicense

ContextualEntityFilterer can filter chunks coming from CHUNK annotations based on entity(identifier,field) info in metadata.

ContextualEntityFilterer can filter chunks coming from CHUNK annotations based on entity(identifier,field) info in metadata. Filters can be done via white list entities, black list entities, black list word and white list words. The filter can be applied to the scope of the sentence or the document.

Example

Define pipeline stages to extract entities

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val sentenceDetector = new SentenceDetector()
  .setInputCols(Array("document"))
  .setOutputCol("sentences")

val tokenizer = new Tokenizer()
  .setInputCols(Array("sentences"))
  .setOutputCol("tokens")

val embedder = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols(Array("sentences", "tokens"))
  .setOutputCol("embeddings")

val nerTagger = MedicalNerModel
  .pretrained("ner_deid_generic_augmented", "en", "clinical/models")
  .setInputCols(Array("sentences", "tokens", "embeddings"))
  .setOutputCol("nerTags")

val nerConverter = new NerConverterInternal()
  .setInputCols(Array("sentences", "tokens", "nerTags"))
  .setOutputCol("nerChunks")

Define ContextualEntityFilterer and set the rules

val jsonRules=
     """
         |[{
         | "entity" : "LOCATION",
         | "scopeWindow" : [2,2],
         | "whiteListEntities" : ["AGE","DATE"],
         | "blackListEntities" : ["ID","NAME"],
         | "scopeWindowLevel"  : "token",
         | "blackListWords" : ["beautiful"]
         | },
         | {
         |  "entity" : "DATE",
         |  "scopeWindow" : [2,2],
         |  "whiteListEntities" : ["AGE","DATE"],
         |  "blackListEntities" : ["ID","NAME"],
         |  "scopeWindowLevel"  : "chunk",
         |  "confidenceThreshold" : 0,50
         | }
         | ]
         |
         |""".stripMargin

 val contextualEntityFilter = new ContextualEntityFilterer()
   .setInputCols(Array("sentences", "tokens", "nerChunks"))
   .setOutputCol("filtered_chunks")
   .setRulesAsStr(jsonRules)
   .setRuleScope("document")



val pipeline = new Pipeline().setStages(Array(
     documentAssembler,
     sentenceDetector,
     tokenizer,
     embedder,
     nerTagger,
     nerConverter,
     contextualEntityFilter
   ))

 val testText = "California, known for its beautiful beaches,and he is 36 years. " +
     "The Grand Canyon in Arizona, where the age is 37, is a stunning natural landmark. " +
     "It was founded on September 9, 1850, and Arizona on February 14, 1912."
 val testDataSet = Seq(testText).toDS.toDF("text")

 val result = pipeline.fit(testDataSet).transform(testDataSet)

Show results

result.selectExpr("explode(filtered_chunks) as filtered").show(100,truncate = false)

    -----------------+-----+-----+------+
    |result           |begin|end  |entity|
    +-----------------+-----+-----+------+
    |36               |54   |55   |AGE   |
    |37               |110  |111  |AGE   |
    |September 9, 1850|164  |180  |DATE  |
    |February 14, 1912|198  |214  |DATE  |
    +-----------------+-----+-----+------+

class ContextualEntityRuler extends AnnotatorModel[ContextualEntityRuler] with HasSimpleAnnotate[ContextualEntityRuler] with HandleExceptionParams with HasSafeAnnotate[ContextualEntityRuler] with CheckLicense

ContextualEntityRuler is an annotator that updates chunks based on contextual rules.

ContextualEntityRuler is an annotator that updates chunks based on contextual rules. These rules are defined in the ContextualEntityRulerRules class and can include prefixes, suffixes, and the context within a specified scope window around the chunk.

This annotator modifies the detected chunks by replacing their entity labels or content based on matching patterns and rules. It is particularly useful for refining entity recognition results in domain-specific text processing.

Example

Define pipeline stages to extract entities:

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val sentenceDetector = new SentenceDetector()
  .setInputCols("document")
  .setOutputCol("sentences")

val tokenizer = new Tokenizer()
  .setInputCols("sentences")
  .setOutputCol("tokens")

val embeddings = WordEmbeddingsModel
  .pretrained("embeddings_clinical", "en", "clinical/models")
  .setInputCols("sentences", "tokens")
  .setOutputCol("embeddings")

val medicalNerModel = MedicalNerModel
  .pretrained("ner_deid_generic_augmented", "en", "clinical/models")
  .setInputCols("sentences", "tokens", "embeddings")
  .setOutputCol("ner")

val nerChunks = new NerConverterInternal()
  .setInputCols("sentences", "tokens", "ner")
  .setOutputCol("nerChunks")

Define ContextualEntityRuler and set the rules:

val jsonRules =
  """
    |[{
    | "entity" : "AGE",
    | "scopeWindow" : [2, 2],
    | "scopeWindowLevel" : "token",
    | "prefixPatterns" : ["is"],
    | "suffixPatterns" : ["years"],
    | "replaceEntity" : "REPLACED_AGE"
    | }]
    |""".stripMargin

val contextualEntityRuler = new ContextualEntityRuler()
  .setInputCols(Array("sentences", "tokens", "nerChunks"))
  .setOutputCol("updated_chunks")
  .setRulesAsStr(jsonRules)
  .setCaseSensitive(false)
  .setAllowPunctuationInBetween(true)

val pipeline = new Pipeline().setStages(
  Array(
    documentAssembler,
    sentenceDetector,
    tokenizer,
    embeddings,
    medicalNerModel,
    nerChunks,
    contextualEntityRuler
  ))

val result = pipeline
  .fit(Seq.empty[String].toDF("text"))
  .transform(Seq(
    "California, known for its beautiful beaches, and he is 36 years old. " +
    "The Grand Canyon in Arizona, where the age is, 37, is a stunning natural landmark. " +
    "It was founded on September 9, 1850, and Arizona on February 14, 1912."
  ).toDF("text"))

Show results:

result.selectExpr("explode(updated_chunks) as filtered").show(100, truncate = false)

// Example output:

 +-----------------+-----+---+--------+
  |result           |begin|end|entity  |
  +-----------------+-----+---+--------+
  |California       |0    |9  |LOCATION|
  |is 36 years      |52   |62 |AGE     |
  |Grand Canyon     |73   |84 |LOCATION|
  |Arizona          |89   |95 |LOCATION|
  |is, 37           |112  |117|AGE     |
  |September 9, 1850|170  |186|DATE    |
  |February 14, 1912|204  |220|DATE    |
  +-----------------+-----+---+--------+

Key Concepts

**Rules**: Define the contextual rules in JSON format, specifying:
- entity: The target entity to match (e.g., AGE).
- scopeWindow: A range [x, y] defining the number of tokens around the entity to consider.
- scopeWindowLevel: The level of the scope window (token or char).
- prefixPatterns: Patterns to match before the entity (e.g., "is").
- suffixPatterns: Patterns to match after the entity (e.g., "years").
- prefixRegexes: Regular expressions to match before the entity.
- suffixRegexes: Regular expressions to match after the entity.
- prefixEntities: Entities to match before the entity.
- suffixEntities: Entities to match after the entity.
- regexInBetween: Regular expression to match text between the entity and prefix/suffix. If matched, the prefix/suffix entities will be included with the target entity.
- replaceEntity: The value to replace the matched entity with.
- mode: The mode of the rule. It can be either "include" or "exclude".
**Parameters**:
- setCaseSensitive: Enables case sensitivity in pattern matching.
- setAllowPunctuationInBetween: Allows punctuation to appear between matched patterns and entities. -

case class ContextualEntityRulerRules(entity: String, scopeWindowLevel: Option[String], prefixPatterns: Option[Array[String]], suffixPatterns: Option[Array[String]], scopeWindow: Option[(Int, Int)], prefixRegexes: Option[Array[String]], suffixRegexes: Option[Array[String]], prefixEntities: Option[Array[String]], suffixEntities: Option[Array[String]], replaceEntity: Option[String], regexInBetween: Option[String], mode: Option[String]) extends Serializable with Product
Represents rules used by the ContextualEntityRuler to update or filter chunks in text.
Represents rules used by the ContextualEntityRuler to update or filter chunks in text.
entity
The target entity to be modified. Example: "AGE"
scopeWindowLevel
Specifies the level of the scope window to consider. Valid values: "token" or "char". Default: "token".
prefixPatterns
Array of patterns (words/phrases) to search **before the entity**. If these patterns are found within the scope, they will contribute to matching. Example: ["years", "old"] matches entities preceded by "years" or "old."
suffixPatterns
Array of patterns (words/phrases) to search **after the entity**. If these patterns are found within the scope, they will contribute to matching. Example: ["years", "old"] matches entities followed by "years" or "old."
scopeWindow
A tuple defining the range of tokens or characters (based on scopeWindowLevel) to include in the scope. Default: (1, 1).
- Both integers must be non-negative or (-1, -1) for no scope limit. Example: (2, 3) means 2 tokens/characters before and 3 after the entity are considered.
prefixRegexes
Array of regular expressions to search **before the entity**. These regexes define more complex matching patterns for prefixes. Example: ["\\b(years|months)\\b"] matches words like "years" or "months" as prefixes.
suffixRegexes
Array of regular expressions to search **after the entity**. These regexes define more complex matching patterns for suffixes. Example: ["\\b(old|young)\\b"] matches words like "old" or "young" as suffixes.
prefixEntities
Array of entities to search **before the entity**. If these entities are found within the scope, they will contribute to matching. Example: ["AGE"] matches entities of type "AGE" before the target entity. Note: This is is not supported for mode 'exclude'.
suffixEntities
Array of entities to search **after the entity**. If these entities are found within the scope, they will contribute to matching. Example: ["AGE"] matches entities of type "AGE" after the target entity. Note: This is is not supported for mode 'exclude'.
replaceEntity
Optional string specifying the new entity to replace the target entity with. Example: "AGE_RANGE" replaces "AGE" with "AGE_RANGE" in matching cases.
regexInBetween
Optional regular expression to match text between the target entity and the prefix/suffix. If the regex matches, the prefix/suffix entities will be merged with target entity. Example: "\\b(of)\\b" merges the prefix entity with the target entity if "of" is between them. Note: This is is not supported for mode 'exclude'.
mode
The mode of the rule. Options: include, exclude, or replace_label_only.
- include: Include given patterns to chunk.
- exclude: Exclude given patterns from chunk.
- replace_label_only: Replace the label of the chunk with the given label. Intact the text. Default: include.
case class ContextualFilteringRules(entity: String, scopeWindowLevel: String, whiteListEntities: Option[Array[String]], blackListEntities: Option[Array[String]], scopeWindow: (Int, Int), blackListWords: Option[Array[String]], whiteListWords: Option[Array[String]], confidenceThreshold: Option[Double], possibleRegexContext: Option[String], impossibleRegexContext: Option[String]) extends Serializable with Product
ContextualFilteringRules is a case class that represents the rules to filter the Chunks.
ContextualFilteringRules is a case class that represents the rules to filter the Chunks.
entity
The field of the entity to filter.
scopeWindowLevel
The level of the scope window. It can be either 'token' or 'chunk'.
whiteListEntities
The white list entities to filter.One element of the white list is enough to keep the chunk.
blackListEntities
The black list entities to filter.All elements of the black list must be absent to keep the chunk.
scopeWindow
The scope window considering chunks to filter. Scope can be calculated looking at tokens or chunks.Decision of chunk or token can be defined by scopeWindowLevel.
blackListWords
The black list words to filter. All elements of the black list must be absent to keep the chunk.
whiteListWords
The white list words to filter. One element of the white list is enough to keep the chunk.
confidenceThreshold
The confidence threshold to filter the chunks. Filtering is only applied if the confidence of the chunk is below the threshold.
possibleRegexContext
The possible regex context to filter the chunks. If the regex is found in the context(chunk), the chunk is kept.
impossibleRegexContext
The impossible regex context to filter the chunks. If the regex is found in the context(chunk), the chunk is removed.

class ContextualParserApproach extends AnnotatorApproach[ContextualParserModel] with HandleExceptionParams with CheckLicense

Creates a model, that extracts entity from a document based on user defined rules.

Creates a model, that extracts entity from a document based on user defined rules. Rule matching is based on a RegexMatcher defined in a JSON file. It is set through the parameter setJsonPath() In this JSON file, regex is defined that you want to match along with the information that will output on metadata field. Additionally, a dictionary can be provided with setDictionary to map extracted entities to a unified representation. The first column of the dictionary file should be the representation with following columns the possible matches.

Example

An example JSON file regex_token.json can look like this:

{
"entity": "Stage",
"ruleScope": "sentence",
"regex": "[cpyrau]?[T][0-9X?][a-z^cpyrau]*",
"matchScope": "token"
}

Which means to extract the stage code on a sentence level. An example pipeline could then be defined like this

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

val sentenceDetector = new SentenceDetector()
  .setInputCols("document")
  .setOutputCol("sentence")

val tokenizer = new Tokenizer()
  .setInputCols("sentence")
  .setOutputCol("token")

Define the parser (json file needs to be provided)

val data = Seq("A patient has liver metastases pT1bN0M0 and the T5 primary site may be colon or... ").toDF("text")
val contextualParser = new ContextualParserApproach()
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("entity")
  .setJsonPath("/path/to/regex_token.json")
  .setCaseSensitive(true)
val pipeline = new Pipeline().setStages(Array(
    documentAssembler,
    sentenceDetector,
    tokenizer,
    contextualParser
  ))

val result = pipeline.fit(data).transform(data)

Show Results

result.selectExpr("explode(entity)").show(5, truncate=false)
+-------------------------------------------------------------------------------------------------------------------------+
|col                                                                                                                      |
+-------------------------------------------------------------------------------------------------------------------------+
|{chunk, 32, 39, pT1bN0M0, {field -> Stage, normalized -> , confidence -> 1.00, sentence -> 0}, []}                 |
|{chunk, 49, 50, T5, {field -> Stage, normalized -> , confidence -> 1.00, sentence -> 0}, []}                       |
|{chunk, 148, 156, cT4bcN2M1, {field -> Stage, normalized -> , confidence -> 1.00, sentence -> 1}, []}              |
|{chunk, 189, 194, T?N3M1, {field -> Stage, normalized -> , confidence -> 1.00, sentence -> 2}, []}                 |
|{chunk, 316, 323, pT1bN0M0, {field -> Stage, normalized -> , confidence -> 1.00, sentence -> 3}, []}               |
+-------------------------------------------------------------------------------------------------------------------------+

See also: ContextualParserModel for the trained model

class ContextualParserModel extends AnnotatorModel[ContextualParserModel] with HasSimpleAnnotate[ContextualParserModel] with HandleExceptionParams with HasSafeAnnotate[ContextualParserModel] with CheckLicense
Extracts entity from a document based on user defined rules.
Extracts entity from a document based on user defined rules. Rule matching is based on a RegexMatcher defined in a JSON file. In this file, regex is defined that you want to match along with the information that will output on metadata field. To instantiate a model, see ContextualParserApproach and its accompanied example.

See also
ContextualParserApproach to create your own model
case class Dictionary(dictionary: Map[String, String]) extends Product with Serializable

Attributes
protected
case class EntityDefinition(entity: String, ruleScope: String, regex: Option[String], contextLength: Option[Double], prefix: Option[List[String]], regexPrefix: Option[String], suffix: Option[List[String]], regexSuffix: Option[String], contextException: Option[List[String]], exceptionDistance: Option[Double], regexContextException: Option[String], matchScope: Option[String], completeMatchRegex: Option[String]) extends Product with Serializable

Attributes
protected
class MatchExceptions extends ContextRules[Boolean] with Serializable
class MatchPrefixSuffix extends ContextRules[(Boolean, Map[String, Double])] with Serializable
class MatchRegex extends ContextRules[(Boolean, Map[String, Double])] with Serializable
class MatchRegexPerSentence extends ContextRules[List[(Boolean, Map[String, Double])]] with Serializable
case class MatchedToken(token: String, begin: Int, end: Int, valueMatch: String, regexMatch: String, sentenceIndex: Int, confidenceValue: Double, normalizedValue: String, tokenIndex: Int) extends Product with Serializable
trait ReadablePretrainedContextualParser extends ParamsAndFeaturesReadable[ContextualParserModel] with HasPretrained[ContextualParserModel]

Value Members

object ContextualEntityFilterer extends ParamsAndFeaturesReadable[ContextualEntityFilterer] with Serializable
object ContextualEntityRuler extends ParamsAndFeaturesReadable[ContextualEntityRuler] with Serializable
object ContextualEntityRulerRules extends Serializable
Companion object for ContextualEntityRulerRules.
object ContextualFilteringRules extends Serializable
Companion object for ContextualFilteringRules.
object ContextualParserApproach extends DefaultParamsReadable[ContextualParserApproach] with Serializable
object ContextualParserModel extends ReadablePretrainedContextualParser with Serializable

Packages

context

package context

Type Members

Example

Example

Key Concepts

Example

Value Members

Ungrouped

Packages

context 

package context

Type Members

Example

Example

Key Concepts

Example

Value Members

Ungrouped

context