This paper is concerned with the question of whether we can predict the future impact of a paper based on the text of the paper. We create a corpus of papers in computational linguistics, and we create gold standard impact annotations by using their Google Scholar citation counts. We use supervised classification approaches to automatically predict the impact of the papers. Our results when using very simple features show some success, but they also show that the classifiers suffer from class imbalance problems.
% Conference paper in the LREC 2018 proceedings. The conference dates and
% city are stored in eventdate/venue; address is the publisher's city.
@InProceedings{CHEN18.2,
  author    = {Chen, Yue and Steimel, Kenneth and Green, Everett and Hjortnaes, Nils and Tian, Zuoyu and Dakota, Daniel and K{\"u}bler, Sandra},
  title     = {Towards Determining Textual Characteristics of High and Low Impact Publications},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  editor    = {Diesner, Jana and Rehm, Georg and Witt, Andreas},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-05-4},
  language  = {english},
}