Awadhi is an Indo-Aryan language, spoken in the eastern region of Uttar Pradesh by approximately 38 million native speakers. However, despite this large number of speakers, it is still severely lacking in language resources such as corpora, language technology tools, and guidelines. This paper presents the first attempt towards developing an annotated corpus and a POS tagger for the language. The corpus is currently annotated with part-of-speech tags. Since there is no earlier tagset available for Awadhi, the POS tagset for the language was developed as part of this research. The tagset is a subset of the BIS scheme, which is the national standard for the development of POS tagsets for Indian languages.
@comment{LREC 2018 paper. "venue" and "eventdate" hold the conference place and days; "address" is the publisher's city.}
@inproceedings{BASIT18.18,
  author    = {Basit, Abdul and Kumar, Ritesh},
  title     = {Towards a Part-of-Speech Tagger for {Awadhi}: Corpus and Experiments},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Jha, Girish Nath and Bali, Kalika and L, Sobha and Ojha, Atul Kr.},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-09-2},
  language  = {english},
}