OpenSubtitles.org provides a large collection of user-contributed subtitles in various languages for movies and TV programs. Subtitle translations are valuable resources for cross-lingual studies and machine translation research. A less-explored feature of the collection is the inclusion of alternative translations, which can be very useful for training paraphrase systems or collecting multi-reference test suites for machine translation. However, differences in translation may also be due to misspellings, incomplete or corrupt data files, or wrongly aligned subtitles. This paper reports our efforts in recognising and classifying alternative subtitle translations with language-independent techniques. We use time-based alignment with lexical re-synchronisation techniques and BLEU score filters, and sort alternative translations into categories using edit distance metrics and heuristic rules. Our approach produces large numbers of sentence-aligned translation alternatives for over 50 languages provided via the OPUS corpus collection.
% Conference paper record (LREC 2016).
% Notes on this entry:
%   - Names are stored in "Last, First" form so BibTeX splits them unambiguously.
%   - The first editor, Nicoletta Calzolari, was annotated "(Conference Chair)"
%     in the original editor list; the annotation is kept here only, because
%     inside the editor field BibTeX would parse it as part of her name.
%   - address is the publisher's city; the conference place and days are held
%     in venue / eventdate (ISO 8601 range) so they do not collide with it.
@InProceedings{TIEDEMANN16.62,
  author    = {Tiedemann, J{\"o}rg},
  title     = {Finding Alternative Translations in a Large Corpus of Movie Subtitle},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Goggi, Sara and Grobelnik, Marko and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  year      = {2016},
  month     = may,
  eventdate = {2016-05-23/2016-05-28},
  venue     = {Portoro{\v{z}}, Slovenia},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {978-2-9517408-9-1},
  language  = {english},
}