sparknlp_jsl.deidentification_module#

Module Contents#

Classes#

Deidentifier

class Deidentifier(spark, custom_pipeline=None, fields=None, ner_chunk='ner_chunk', sentence='sentence', token='token', document='document', masking_policy='entity_labels', fixed_mask_length=1, obfuscate_date=True, obfuscate_ref_source='faker', obfuscate_ref_file_path=None, age_group_obfuscation=False, age_ranges=None, shift_days=False, number_of_days=None, documenthashcoder_col_name='documentHash', date_tag='DATE', language='en', region='us', unnormalized_date=False, unnormalized_mode='mask', id_column_name='id', date_shift_column_name='dateshift', multi_mode_file_path=None, domain=None, separator='\t', input_file_path=None, output_file_path='deidentified.csv')#
age_group_obfuscation = False#
age_ranges = None#
custom_pipeline = None#
date_shift_column_name = 'dateshift'#
date_tag = 'DATE'#
document = 'document'#
documenthashcoder_col_name = 'documentHash'#
domain = None#
fields = None#
fixed_mask_length = 1#
id_column_name = 'id'#
input_file_path = None#
language = 'en'#
masking_policy = 'entity_labels'#
multi_mode_file_path = None#
ner_chunk = 'ner_chunk'#
number_of_days = None#
obfuscate_date = True#
obfuscate_ref_file_path = None#
obfuscate_ref_source = 'faker'#
output_file_path = 'deidentified.csv'#
region = 'us'#
sentence = 'sentence'#
separator = '\t'#
shift_days = False#
spark#
token = 'token'#
unnormalized_date = False#
unnormalized_mode = 'mask'#
deid_with_custom_pipeline(pretrained_pipeline=None)#

This method deidentifies the given data with a custom pipeline.

deid_with_pretrained_pipeline()#

This method deidentifies the data with a pretrained pipeline.

deidentify()#

This method deidentifies the input file according to the given field names and saves the results as a CSV/JSON file.