Languages are disappearing at an alarming rate, and the linguistic rights of speakers of most of the 7000 languages are at risk. ICT plays a key role in the preservation of endangered languages; as the ultimate use of ICT, natural language processing must be highlighted, since in this century the lack of such support hampers literacy acquisition and prevents the use of the Internet and any electronic means. The first step is the building of resources for processing; therefore, we introduce the first speech corpus of Southern Quechua, Siminchik, suitable for training and evaluating speech recognition systems. The corpus consists of 97 hours of spontaneous conversations recorded in radio programs in the Southern regions of Peru. The annotation task was carried out by native speakers from those regions using the unified written convention. We present initial experiments on speech recognition and language modeling and explain the challenges inherent to the nature and current status of this ancestral language.
@comment{LREC 2018 paper on the Siminchik speech corpus. The conference place and days are stored in venue/eventdate; address is the publisher's city.}
@inproceedings{ZEVALLOS18.4,
  author    = {Zevallos, Rodolfo and Camacho, Luis},
  title     = {{Siminchik}: A Speech Corpus for Preservation of {Southern Quechua}},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Schuurman, Ineke and Sevens, Leen and Yaneva, Victoria and O'Flaherty, John},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-12-2},
  language  = {english},
}