The paper describes a new approach for aligning English and Arabic documents for the purpose of comparable corpora construction. The proposed approach makes use of LDA topics to analyze the topical structures of the documents. Based on the LDA topics, we create a topic mapping dictionary to automatically transfer a set of keywords describing the topics within the source document to the target language, and use the transferred knowledge to judge whether two documents written in English and Arabic are comparable. Besides the topical mappings, we also use traditional translation-based features to boost the alignment performance. We also integrated our alignment approach within a search tool that enables users to search for English documents, select an English document, and retrieve Arabic documents comparable to the selected English document. The Arabic documents are ranked according to how comparable they are to the selected English document. In both cases, the tool lets the user read the articles.
@comment{
  Conference paper entry. year/month give the publication date of the
  proceedings; eventdate/venue give when and where the conference took
  place; address is the publisher's city, not the conference venue.
}
@inproceedings{SABBAH18.9,
  author    = {Sabbah, Firas and Aker, Ahmet},
  title     = {Creating Comparable Corpora through Topic Mappings},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Rapp, Reinhard and Zweigenbaum, Pierre and Sharoff, Serge},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-07-8},
  language  = {english},
}