In this paper, we discuss the automatic identification of language in Assamese-English-Hindi code-mixed data at the word level. The data for this study was collected from public Facebook Pages and was annotated using a minimal tagset for code-mixed data. A Support Vector Machine classifier was trained on the full tagged dataset of approximately 20k tokens. The best-performing classifier achieved a state-of-the-art accuracy of over 96%.
% Conference place and days are stored in `venue`/`eventdate`; `address` is the publisher's city.
@InProceedings{BORA18.17,
  author    = {Bora, Manas Jyoti and Kumar, Ritesh},
  title     = {Automatic Word-level Identification of Language in {Assamese} -- {English} -- {Hindi} Code-mixed Data},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  editor    = {Jha, Girish Nath and Bali, Kalika and L, Sobha and Ojha, Atul Kr.},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-09-2},
  language  = {english},
}