In this paper, we present a biomedical Chinese-English parallel corpus aligned at the sentence level. We collected biomedical abstracts that are available in both Chinese and English from MEDLINE and generated a dataset of 5,129 bilingual abstracts. We then employed the Champollion aligner, which uses both lexicon and sentence-length features, to align the sentences in both languages. The aligned parallel corpus contains 61,874 English sentences and 43,866 Chinese sentences. The corpus is still under development, as we are manually checking the sentence boundaries and the alignment. We believe such a publicly available corpus will benefit the development of cross-lingual systems and applications in the biomedical domain.
% Conference paper from LREC 2018. Names are separated with " and " in
% "Last, First" form. "address" is the publisher's city; the conference
% place and dates are kept in the biblatex fields "venue" and "eventdate"
% (classic BibTeX styles ignore fields they do not know).
@inproceedings{TANG18.7,
  author    = {Tang, Lingyi and Xu, Jun and Hu, Xinyue and Wei, Qiang and Xu, Hua},
  title     = {Building a Biomedical {Chinese-English} Parallel Corpus from {MEDLINE}},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Melero, Maite and Krallinger, Martin and Gonzalez-Agirre, Aitor},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-03-0},
  language  = {english},
}