Title |
A Common Parts-of-Speech Tagset Framework for Indian Languages |
Authors |
Baskaran Sankaran, Kalika Bali, Monojit Choudhury, Tanmoy Bhattacharya, Pushpak Bhattacharyya, Girish Nath Jha, S. Rajendran, K. Saravanan, L. Sobha and K.V. Subbarao |
Abstract |
We present a universal Parts-of-Speech (POS) tagset framework covering most of the Indian languages (ILs) following the hierarchical and decomposable tagset schema. In spite of significant number of speakers, there is no workable POS tagset and tagger for most ILs, which serve as fundamental building blocks for NLP research. Existing IL POS tagsets are often designed for a specific language; the few that have been designed for multiple languages cover only shallow linguistic features ignoring linguistic richness and the idiosyncrasies. The new framework that is proposed here addresses these deficiencies in an efficient and principled manner. We follow a hierarchical schema similar to that of EAGLES and this enables the framework to be flexible enough to capture rich features of a language/ language family, even while capturing the shared linguistic structures in a methodical way. The proposed common framework further facilitates the sharing and reusability of scarce resources in these languages and ensures cross-linguistic compatibility. |
Language |
Multiple languages |
Topics |
Tagging, Standards for LRs, Morphology |
Full paper |
A Common Parts-of-Speech Tagset Framework for Indian Languages |
Slides |
- |
Bibtex |
@InProceedings{SANKARAN08.337,
author = {Baskaran Sankaran, Kalika Bali, Monojit Choudhury, Tanmoy Bhattacharya, Pushpak Bhattacharyya, Girish Nath Jha, S. Rajendran, K. Saravanan, L. Sobha and K.V. Subbarao},
title = {A Common Parts-of-Speech Tagset Framework for Indian Languages},
booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC'08)},
year = {2008},
month = {may},
date = {28-30},
address = {Marrakech, Morocco},
editor = {Nicoletta Calzolari (Conference Chair), Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, Daniel Tapias},
publisher = {European Language Resources Association (ELRA)},
isbn = {2-9517408-4-0},
note = {http://www.lrec-conf.org/proceedings/lrec2008/},
language = {english}
} |