Amharic Lemmatizer


This model uses context and language knowledge to assign all forms and inflections of a word to a single root. This enables the pipeline to treat the past and present tense of a verb, for example, as the same word instead of two completely different words. The lemmatizer takes into consideration the context surrounding a word to determine which root is correct when the word form alone is ambiguous.

How to use

# Build a three-stage Spark NLP pipeline: raw text -> document -> tokens -> lemmas.
# NOTE(review): assumes the usual Spark NLP names (DocumentAssembler, Tokenizer,
# LemmatizerModel, LightPipeline, Pipeline) are imported and that `spark` is an
# active SparkSession (e.g. from sparknlp.start()) — confirm against your setup.

# Wraps the raw "text" column into the "document" annotation the tokenizer reads.
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Splits each document into "token" annotations.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Pretrained Amharic lemmatizer; reads "token" and writes "lemma".
lemmatizer = LemmatizerModel.pretrained("lemma", "am") \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

nlp_pipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer])

# Every stage is already trained, so fitting on a one-row empty DataFrame is
# enough to obtain the PipelineModel that LightPipeline wraps.
light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))
results = light_pipeline.fullAnnotate(["መጽሐፉን መጽሐፍ ኡ ን አስያዛት አስያዝ ኧ ኣት ።"])

// Build a three-stage Spark NLP pipeline: raw text -> document -> tokens -> lemmas.
// NOTE(review): assumes the Spark NLP / Spark ML classes are imported and that
// `spark` is an active SparkSession — confirm against your setup.
import spark.implicits._ // required for Seq(...).toDF

// Wraps the raw "text" column into the "document" annotation the tokenizer reads.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Splits each document into "token" annotations.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Pretrained Amharic lemmatizer; reads "token" and writes "lemma".
val lemmatizer = LemmatizerModel.pretrained("lemma", "am")
  .setInputCols(Array("token"))
  .setOutputCol("lemma")

val pipeline = new Pipeline().setStages(Array(document_assembler, tokenizer, lemmatizer))

// Same sample sentence as the Python and NLU examples.
val data = Seq("መጽሐፉን መጽሐፍ ኡ ን አስያዛት አስያዝ ኧ ኣት ።").toDF("text")
val result = pipeline.fit(data).transform(data)
import nlu

# Load the pretrained Amharic lemmatizer pipeline through NLU.
amharic_lemmatizer = nlu.load('am.lemma')

# Sample input: a list holding one Amharic sentence.
text = ["መጽሐፉን መጽሐፍ ኡ ън አስያዛት አስያዝ ኧ ኣት ።".replace("ън", "ን")]

# Run prediction on the sample, requesting document-level output.
lemma_df = amharic_lemmatizer.predict(text, output_level="document")


{'lemma': [Annotation(token, 0, 4, _, {'sentence': '0'}),
  Annotation(token, 6, 9, መጽሐፍ, {'sentence': '0'}),
  Annotation(token, 11, 11, ኡ, {'sentence': '0'}),
  Annotation(token, 13, 13, ን, {'sentence': '0'}),
  Annotation(token, 15, 19, _, {'sentence': '0'}),
  Annotation(token, 21, 24, አስያዝ, {'sentence': '0'}),
  Annotation(token, 26, 26, ኧ, {'sentence': '0'}),
  Annotation(token, 28, 29, ኣት, {'sentence': '0'}),
  Annotation(token, 31, 31, ።, {'sentence': '0'})]}

Model Information

Model Name: lemma
Compatibility: Spark NLP 2.7.0+
License: Open Source
Edition: Official
Input Labels: [token]
Output Labels: [lemma]
Language: am

Data Source

The model was trained on data from Universal Dependencies, version 2.7.


Binyam Ephrem Seyoum, Yusuke Miyao and Baye Yimam Mekonnen. 2018. Universal Dependencies for Amharic. In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018), pp. 2216–2222, Miyazaki, Japan: European Language Resources Association (ELRA).