Title |
On the Use of Web Resources and Natural Language Processing Techniques to Improve Automatic Speech Recognition Systems |
Authors |
Gwénolé Lecorvé, Guillaume Gravier and Pascale Sébillot |
Abstract |
Language models used in current automatic speech recognition systems are trained on general-purpose corpora and are therefore not relevant to transcribe spoken documents dealing with successive precise topics, such as long multimedia streams, frequently tacking reportages and debates. To overcome this problem, this paper shows that Web resources and natural language processing techniques can be effective to automatically adapt the baseline language model of an automatic speech recognition system to any encountered topic. More precisely, we detail how to characterize the topic of a transcription segment and how to collect Web pages from which a topic-specific language model can be trained. Then, an adapted language model is obtained by combining the topic-specific language model with the general-purpose language model. Finally, new transcriptions are generated using the adapted language model and are compared with transcriptions previously obtained with the baseline language model. Experiments show that our topic adaptation technique leads to significant transcription quality gains. |
Language |
Single language |
Topics |
Speech recognition and understanding, Language modelling, Information Extraction, Information Retrieval |
Full paper |
On the Use of Web Resources and Natural Language Processing Techniques to Improve Automatic Speech Recognition Systems |
Slides |
On the Use of Web Resources and Natural Language Processing Techniques to Improve Automatic Speech Recognition Systems |
Bibtex |
@InProceedings{LECORV08.155,
  author        = {Lecorv{\'e}, Gw{\'e}nol{\'e} and Gravier, Guillaume and S{\'e}billot, Pascale},
  title         = {On the Use of Web Resources and Natural Language Processing Techniques to Improve Automatic Speech Recognition Systems},
  booktitle     = {Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC'08)},
  year          = {2008},
  month         = may,
  eventdate     = {2008-05-28/2008-05-30},
  address       = {Marrakech, Morocco},
  editor        = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Tapias, Daniel},
  publisher     = {European Language Resources Association (ELRA)},
  isbn          = {2-9517408-4-0},
  url           = {http://www.lrec-conf.org/proceedings/lrec2008/},
  language      = {english},
  internal-note = {Nicoletta Calzolari was listed as Conference Chair in the original editor field; the role text was removed so that the editor names parse correctly.},
} |