Visual NER on 10K Filings (SEC)

Description

This is a Visual NER model that extracts the main key points from the summary page of SEC 10-K filings (annual reports).

Predicted Entities

REGISTRANT, ADDRESS, PHONE, DATE, EMPLOYERIDNB, EXCHANGE, STATE, STOCKCLASS, STOCKVALUE, TRADINGSYMBOL, FILENUMBER

Live Demo Open in Colab Copy S3 URI

How to use

# Alias used below for Spark SQL column functions (split / element_at / explode).
from pyspark.sql import functions as f

# Stage 1: turn the raw bytes in the "content" column into an "image" column.
binary_to_image = BinaryToImage()\
    .setInputCol("content") \
    .setOutputCol("image") \
    .setImageType(ImageType.TYPE_3BYTE_BGR)

# Stage 2: produce hOCR output from the image.
img_to_hocr = ImageToHocr()\
    .setInputCol("image")\
    .setOutputCol("hocr")\
    .setIgnoreResolution(False)\
    .setOcrParams(["preserve_interword_spaces=0"])

# Stage 3: tokenize the hOCR output.
tokenizer = HocrTokenizer()\
    .setInputCol("hocr")\
    .setOutputCol("token")

# Stage 4: the pretrained visual NER model; it consumes both tokens and the image.
doc_ner = VisualDocumentNerV21()\
    .pretrained("visualner_10kfilings", "en", "clinical/models")\
    .setInputCols(["token", "image"])\
    .setOutputCol("entities")

# Stage 5: draw the detected entities onto the image.
draw = ImageDrawAnnotations() \
    .setInputCol("image") \
    .setInputChunksCol("entities") \
    .setOutputCol("image_with_annotations") \
    .setFontSize(10) \
    .setLineWidth(4)\
    .setRectColor(Color.red)

# OCR pipeline
pipeline = PipelineModel(stages=[
    binary_to_image,
    img_to_hocr,
    tokenizer,
    doc_ner,
    draw
])

# The "binaryFile" source exposes (among others) the "path" and "content" columns used here.
bin_df = spark.read.format("binaryFile").load('data/t01.jpg')

results = pipeline.transform(bin_df).cache()
res = results.collect()
path_array = f.split(results['path'], '/')

# The file name is the last path segment. element_at(..., -1) reads the last
# array element directly and avoids passing a Column to Column.getItem, which
# is deprecated since Spark 3.0.
results.withColumn('filename', f.element_at(path_array, -1)) \
    .withColumn("exploded_entities", f.explode("entities")) \
    .select("filename", "exploded_entities") \
    .show(truncate=False)
// Spark names used below. The Visual NLP classes (BinaryToImage, ImageToHocr,
// HocrTokenizer, VisualDocumentNerV21, ImageDrawAnnotations, ImageType) and
// Color are assumed to be imported already, as in the rest of this example.
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{functions => f}

// Stage 1: turn the raw bytes in the "content" column into an "image" column.
val binary_to_image = new BinaryToImage()
    .setInputCol("content")
    .setOutputCol("image")
    .setImageType(ImageType.TYPE_3BYTE_BGR)

// Stage 2: produce hOCR output from the image.
val img_to_hocr = new ImageToHocr()
    .setInputCol("image")
    .setOutputCol("hocr")
    .setIgnoreResolution(false)
    .setOcrParams(Array("preserve_interword_spaces=0"))

// Stage 3: tokenize the hOCR output.
val tokenizer = new HocrTokenizer()
    .setInputCol("hocr")
    .setOutputCol("token")

// Stage 4: the pretrained visual NER model, loaded through the companion
// object; it consumes both tokens and the image.
val doc_ner = VisualDocumentNerV21
    .pretrained("visualner_10kfilings", "en", "clinical/models")
    .setInputCols(Array("token", "image"))
    .setOutputCol("entities")

// Stage 5: draw the detected entities onto the image.
val draw = new ImageDrawAnnotations()
    .setInputCol("image")
    .setInputChunksCol("entities")
    .setOutputCol("image_with_annotations")
    .setFontSize(10)
    .setLineWidth(4)
    .setRectColor(Color.red)

// OCR pipeline
// PipelineModel has no public constructor in the Scala API, so the stages are
// assembled in a Pipeline and fitted below to obtain the model.
val pipeline = new Pipeline().setStages(Array(
    binary_to_image,
    img_to_hocr,
    tokenizer,
    doc_ner,
    draw))

// The "binaryFile" source exposes (among others) the "path" and "content" columns used here.
val bin_df = spark.read.format("binaryFile").load("data/t01.jpg")

val results = pipeline.fit(bin_df).transform(bin_df).cache()
val res = results.collect()
val path_array = f.split(results("path"), "/")

// The file name is the last path segment; element_at(..., -1) reads the last
// array element (Column.getItem does not accept a Column key in Scala).
results.withColumn("filename", f.element_at(path_array, -1))
    .withColumn("exploded_entities", f.explode(f.col("entities")))
    .select("filename", "exploded_entities")
    .show(truncate = false)

Example

Input image

Screenshot

Output image

Screenshot

Output text

+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|filename|exploded_entities                                                                                                                                         |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|t01.jpg |{named_entity, 712, 716, OTHERS, {confidence -> 96, width -> 74, x -> 1557, y -> 416, word -> Ended, token -> ended, height -> 18}, []}                   |
|t01.jpg |{named_entity, 718, 724, DATE-B, {confidence -> 96, width -> 97, x -> 1639, y -> 416, word -> January, token -> january, height -> 24}, []}               |
|t01.jpg |{named_entity, 726, 727, DATE-I, {confidence -> 95, width -> 34, x -> 1743, y -> 416, word -> 31,, token -> 31, height -> 22}, []}                        |
|t01.jpg |{named_entity, 730, 733, DATE-I, {confidence -> 96, width -> 54, x -> 1785, y -> 416, word -> 2021, token -> 2021, height -> 18}, []}                     |
|t01.jpg |{named_entity, 735, 744, OTHERS, {confidence -> 91, width -> 143, x -> 1372, y -> 472, word -> Commission, token -> commission, height -> 18}, []}        |
|t01.jpg |{named_entity, 746, 749, OTHERS, {confidence -> 96, width -> 36, x -> 1523, y -> 472, word -> file, token -> file, height -> 18}, []}                     |
|t01.jpg |{named_entity, 751, 756, OTHERS, {confidence -> 92, width -> 96, x -> 1568, y -> 472, word -> number:, token -> number, height -> 18}, []}                |
|t01.jpg |{named_entity, 759, 761, FILENUMBER-B, {confidence -> 92, width -> 119, x -> 1675, y -> 472, word -> 001-39495, token -> 001, height -> 18}, []}          |
|t01.jpg |{named_entity, 769, 773, REGISTRANT-B, {confidence -> 92, width -> 136, x -> 1472, y -> 558, word -> ASANA,, token -> asana, height -> 31}, []}           |
|t01.jpg |{named_entity, 776, 778, REGISTRANT-I, {confidence -> 95, width -> 72, x -> 1620, y -> 558, word -> INC., token -> inc, height -> 25}, []}                |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------+

Model Information

Model Name: visualner_10kfilings
Type: ocr
Compatibility: Visual NLP 4.0.0+
License: Licensed
Edition: Official
Language: en
Size: 744.4 MB

References

SEC 10k filings