This paper describes the creation of a high-quality parallel corpus for Portuguese and Chinese that was extracted from parallel and comparable documents. The corpus is constructed using an on-line alignment platform, UM-pAligner. The UM-pAligner consists of two main alignment components, parallel sentence identification and classification model, for acquiring the parallel sentences from either parallel or comparable texts in a semi-automatic manner. The extracted parallel sentences are manually verified. The resulting corpus is composed of parallel sentences covering texts from the newswire, legal, subtitle, technical and general on-line publication domains, around 6 million parallel sentences in total. About 1 million parallel sentences are compiled and made available for download at the NLP2CT website.
% LREC 2018 paper entry. The conference place and days are kept in venue/eventdate; address is the publisher's city.
@inproceedings{CHAO18.17,
  author    = {Chao, Lidia S. and Wong, Derek F.},
  title     = {{UM-PCorpus}: A Large {Portuguese-Chinese} Parallel Corpus},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Yang, Erhong and Sun, Le},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-29-0},
  language  = {english},
}