Web corpora are often constructed automatically, and their contents are therefore often not well understood. One technique for assessing the composition of such a web corpus is to empirically measure its similarity to a reference corpus whose composition is known. In this paper we evaluate a number of measures of corpus similarity, including a method based on topic modelling which has not been previously evaluated for this task. To evaluate these methods we use known-similarity corpora that have been previously used for this purpose, as well as a number of newly-constructed known-similarity corpora targeting differences in genre, topic, time, and region. Our findings indicate that, overall, the topic modelling approach did not improve on a chi-square method that had previously been found to work well for measuring corpus similarity.
% LREC 2016 conference paper. The conference days and venue are stored in
% eventdate/venue; address is the publisher's city. The conference-chair role
% of the first editor is recorded in the non-printing internal-note field so
% that it does not corrupt name parsing in the editor list.
@inproceedings{FOTHERGILL16.154,
  author        = {Fothergill, Richard and Cook, Paul and Baldwin, Timothy},
  title         = {Evaluating a Topic Modelling Approach to Measuring Corpus Similarity},
  booktitle     = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
  editor        = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Goggi, Sara and Grobelnik, Marko and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  year          = {2016},
  month         = may,
  eventdate     = {2016-05-23/2016-05-28},
  venue         = {Portoro{\v{z}}, Slovenia},
  publisher     = {European Language Resources Association (ELRA)},
  address       = {Paris, France},
  isbn          = {978-2-9517408-9-1},
  language      = {english},
  internal-note = {Nicoletta Calzolari is listed as Conference Chair in the original record},
}