Multilingual parliaments have been a useful source for monolingual and multilingual corpus collection. However, it is often the case that extra-textual information about speakers or the original language of the sentences is absent, and as a result, these resources cannot be fully used in translation studies. In this paper we present a method for processing and building a parallel corpus consisting of parliamentary debates of the European Parliament for English into German and English into Spanish. The paper documents all necessary (pre- and post-) processing steps for creating such a valuable resource. In addition to the parallel corpora, we collect monolingual comparable corpora for English, German and Spanish using the same method.
% LREC 2018 conference paper. Names are separated with " and " in
% "Last, First" form. The conference days (7-12 May 2018) and the conference
% city are stored in eventdate/venue; address is the publisher's city.
@InProceedings{KARAKANTA18.10,
  author    = {Karakanta, Alina and Vela, Mihaela and Teich, Elke},
  title     = {Preserving Metadata from Parliamentary Debates},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Fi{\v{s}}er, Darja and Eskevich, Maria and de Jong, Franciska},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-02-3},
  language  = {english},
}