Parallel corpora extracted from online repositories of movie and TV subtitles are employed in a wide range of NLP applications, from language modelling to machine translation and dialogue systems. However, the subtitles uploaded in such repositories exhibit varying levels of quality. A particularly difficult problem stems from the fact that a substantial number of these subtitles are not written by human subtitlers but are simply generated through the use of online translation engines. This paper investigates whether these machine-generated subtitles can be detected automatically using a combination of linguistic and extra-linguistic features. We show that a feedforward neural network trained on a small dataset of subtitles can detect machine-generated subtitles with a F-1 score of 0.64. Furthermore, applying this detection model on an unlabelled sample of subtitles allows us to provide a statistical estimate for the proportion of subtitles that are machine-translated (or are at least of very low quality) in the full corpus.
@InProceedings{LISON18.5, author = {Pierre Lison and A. Seza Doğruöz}, title = {Detecting Machine-translated Subtitles in Large Parallel Corpora}, booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, year = {2018}, month = {may}, date = {7-12}, location = {Miyazaki, Japan}, editor = {Reinhard Rapp and Pierre Zweigenbaum and Serge Sharoff}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {979-10-95546-07-8}, language = {english} }