Description
this NER model is based on LayoutLM pre-trained model and fine-tuned with SROIE dataset
Predicted Entities
- O
- B-DATE
- B-COMPANY
- B-TOTAL
How to use
after feeding hocr as the input, this model should predict the related entity per work/token
ocr = ImageToHocr()\
.setInputCol("image")\
.setOutputCol("hocr")\
.setIgnoreResolution(False)\
.setOcrParams(["preserve_interword_spaces=0"])
doc_ner = VisualDocumentNer()\
.pretrained("visual_document_NER_SROIE0526", "en", "public/ocr/models") \
.setInputCol("hocr")\
.setOutputCol("label")
df = doc_ner.transform(ocr.transform(visual_document_df))
path_array = split(df['path'], '/')
df.withColumn('filename', path_array.getItem(size(path_array) - 1)) \
.select("filename", "entities", "label") \
.show(truncate=False)
val ocr = new ImageToHocr()
.setInputCol("corrected_image")
.setOutputCol("hocr")
.setIgnoreResolution(false)
.setOcrParams(Array("preserve_interword_spaces=0"))
val visualDocumentNER = VisualDocumentNER
.pretrained(testSparkModel, "en", "public/ocr/models")
.setInputCol("hocr")
val results = visualDocumentNER.transform(ocr)
Results
+------------------------------------------------------------------------+---------+
|entities |label |
+------------------------------------------------------------------------+---------+
|[entity, 0, 0, O, [word -> [1060, token -> [], []] |O |
|[entity, 0, 0, O, [word -> [1060, token -> 1060], []] |O |
|[entity, 0, 0, O, [word -> [1060, token -> 1060], []] |O |
|[entity, 0, 0, O, [word -> 257, token -> 257], []] |O |
|[entity, 0, 0, O, [word -> LEMON, token -> lemon], []] |O |
|[entity, 0, 0, O, [word -> TREE, token -> tree], []] |O |
|[entity, 0, 0, B-COMPANY, [word -> RESTAURANT, token -> restaurant], []]|B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> JTJ, token -> jtj], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> JTJ, token -> jtj], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> JTJ, token -> jtj], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> FOODS, token -> foods], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> SDN, token -> sdn], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> SDN, token -> sdn], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> BHD, token -> bhd], []] |B-COMPANY|
|[entity, 0, 0, B-COMPANY, [word -> BHD, token -> bhd], []] |B-COMPANY|
|[entity, 0, 0, O, [word -> (1179227A), token -> (], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> 1179227a], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> 1179227a], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> 1179227a], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> 1179227a], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> 1179227a], []] |O |
|[entity, 0, 0, O, [word -> (1179227A), token -> )], []] |O |
|[entity, 0, 0, O, [word -> GST, token -> gst], []] |O |
|[entity, 0, 0, O, [word -> GST, token -> gst], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> 001085747200, token -> 001085747200], []] |O |
|[entity, 0, 0, O, [word -> No, token -> no], []] |O |
|[entity, 0, 0, O, [word -> 3,, token -> 3], []] |O |
|[entity, 0, 0, O, [word -> 3,, token -> ,], []] |O |
|[entity, 0, 0, O, [word -> Jalan, token -> jalan], []] |O |
|[entity, 0, 0, O, [word -> Permas, token -> permas], []] |O |
|[entity, 0, 0, O, [word -> Permas, token -> permas], []] |O |
|[entity, 0, 0, O, [word -> 10/8,, token -> 10], []] |O |
|[entity, 0, 0, O, [word -> 10/8,, token -> /], []] |O |
|[entity, 0, 0, O, [word -> 10/8,, token -> 8], []] |O |
|[entity, 0, 0, O, [word -> 10/8,, token -> ,], []] |O |
|[entity, 0, 0, O, [word -> Bandar, token -> bandar], []] |O |
|[entity, 0, 0, O, [word -> Bandar, token -> bandar], []] |O |
|[entity, 0, 0, O, [word -> Baru, token -> baru], []] |O |
|[entity, 0, 0, O, [word -> Baru, token -> baru], []] |O |
|[entity, 0, 0, O, [word -> Perrnas, token -> perrnas], []] |O |
|[entity, 0, 0, O, [word -> Perrnas, token -> perrnas], []] |O |
|[entity, 0, 0, O, [word -> Perrnas, token -> perrnas], []] |O |
|[entity, 0, 0, O, [word -> Jaya,, token -> jaya], []] |O |
|[entity, 0, 0, O, [word -> Jaya,, token -> ,], []] |O |
|[entity, 0, 0, O, [word -> 81750, token -> 81750], []] |O |
|[entity, 0, 0, O, [word -> 81750, token -> 81750], []] |O |
|[entity, 0, 0, O, [word -> 81750, token -> 81750], []] |O |
|[entity, 0, 0, O, [word -> Masai,, token -> masai], []] |O |
|[entity, 0, 0, O, [word -> Masai,, token -> masai], []] |O |
|[entity, 0, 0, O, [word -> Masai,, token -> ,], []] |O |
|[entity, 0, 0, O, [word -> Johor, token -> johor], []] |O |
|[entity, 0, 0, O, [word -> 07, token -> 07], []] |O |
|[entity, 0, 0, O, [word -> 3823456, token -> 3823456], []] |O |
|[entity, 0, 0, O, [word -> 3823456, token -> 3823456], []] |O |
|[entity, 0, 0, O, [word -> 3823456, token -> 3823456], []] |O |
|[entity, 0, 0, O, [word -> 3823456, token -> 3823456], []] |O |
|[entity, 0, 0, O, [word -> SIMPLIFIED, token -> simplified], []] |O |
|[entity, 0, 0, O, [word -> TAX, token -> tax], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> INVOICENO, token -> invoiceno], []] |O |
|[entity, 0, 0, O, [word -> INVOICENO, token -> invoiceno], []] |O |
|[entity, 0, 0, O, [word -> INVOICENO, token -> invoiceno], []] |O |
|[entity, 0, 0, O, [word -> INVOICENO, token -> invoiceno], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> ©s00014], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> ©s00014], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> ©s00014], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> ©s00014], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> ©s00014], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> /], []] |O |
|[entity, 0, 0, O, [word -> ©S00014/69, token -> 69], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> INVOICE, token -> invoice], []] |O |
|[entity, 0, 0, O, [word -> DALE:, token -> dale], []] |O |
|[entity, 0, 0, O, [word -> DALE:, token -> :], []] |O |
|[entity, 0, 0, B-DATE, [word -> 6/1/2018, token -> 6], []] |B-DATE |
|[entity, 0, 0, O, [word -> 6/1/2018, token -> /], []] |O |
|[entity, 0, 0, B-DATE, [word -> 6/1/2018, token -> 1], []] |B-DATE |
|[entity, 0, 0, O, [word -> 6/1/2018, token -> /], []] |O |
|[entity, 0, 0, B-DATE, [word -> 6/1/2018, token -> 2018], []] |B-DATE |
|[entity, 0, 0, O, [word -> 6:42:02, token -> 6], []] |O |
|[entity, 0, 0, O, [word -> 6:42:02, token -> :], []] |O |
|[entity, 0, 0, O, [word -> 6:42:02, token -> 42], []] |O |
|[entity, 0, 0, O, [word -> 6:42:02, token -> :], []] |O |
|[entity, 0, 0, O, [word -> 6:42:02, token -> 02], []] |O |
|[entity, 0, 0, O, [word -> PM, token -> pm], []] |O |
|[entity, 0, 0, O, [word -> WAITER:, token -> waiter], []] |O |
|[entity, 0, 0, O, [word -> WAITER:, token -> :], []] |O |
|[entity, 0, 0, O, [word -> Vanessa, token -> vanessa], []] |O |
|[entity, 0, 0, O, [word -> “Sane, token -> “sane], []] |O |
|[entity, 0, 0, O, [word -> “Sane, token -> “sane], []] |O |
|[entity, 0, 0, O, [word -> “Sane, token -> “sane], []] |O |
|[entity, 0, 0, O, [word -> Pax, token -> pax], []] |O |
+------------------------------------------------------------------------+---------+
Model Information
Model Name: | visual_document_NER_SROIE0526_en_3.0.0_3.0.1_1621990933091 |
Type: | ocr |
Compatibility: | Spark NLP 3.0.0+ |
License: | Licensed |
Edition: | Official |
Language: | en |
Case sensitive: | false |
Max sentense length: | 512 |
Data Source
SROIE