Title |
A Corpus of Spontaneous Speech in Lectures: The KIT Lecture Corpus for Spoken Language Processing and Translation |
Authors |
Eunah Cho, Sarah Fünfer, Sebastian Stüker and Alex Waibel |
Abstract |
With the increasing number of applications handling spontaneous speech, the needs to process spoken languages become stronger. Speech disfluency is one of the most challenging tasks to deal with in automatic speech processing. As most applications are trained with well-formed, written texts, many issues arise when processing spontaneous speech due to its distinctive characteristics. Therefore, more data with annotated speech disfluencies will help the adaptation of natural language processing applications, such as machine translation systems. In order to support this, we have annotated speech disfluencies in German lectures at KIT. In this paper we describe how we annotated the disfluencies in the data and provide detailed statistics on the size of the corpus and the speakers. Moreover, machine translation performance on a source text including disfluencies is compared to the results of the translation of a source text without different sorts of disfluencies or no disfluencies at all. |
Topics |
Machine Translation, SpeechToSpeech Translation, Corpus (Creation, Annotation, etc.) |
Full paper |
A Corpus of Spontaneous Speech in Lectures: The KIT Lecture Corpus for Spoken Language Processing and Translation |
Bibtex |
@InProceedings{CHO14.311,
author = {Eunah Cho and Sarah Fünfer and Sebastian Stüker and Alex Waibel}, title = {A Corpus of Spontaneous Speech in Lectures: The KIT Lecture Corpus for Spoken Language Processing and Translation}, booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)}, year = {2014}, month = {may}, date = {26-31}, address = {Reykjavik, Iceland}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, isbn = {978-2-9517408-8-4}, language = {english} } |