We present E-TIPSY, a search query corpus annotated with named Entities, Term Importance, POS tags, and SYntactic parses. This corpus contains crowdsourced (gold) annotations of the three most important terms in each query. In addition, it contains automatically produced annotations of named entities, part-of-speech tags, and syntactic parses for the same queries. This corpus comes in two formats: (1) Sober Subset: annotations that two or more crowd workers agreed upon, and (2) Full Glass: all annotations. We analyze the strikingly low correlation between term importance and syntactic headedness, which invites research into effective ways of combining these different signals. Our corpus can serve as a benchmark for term importance methods aimed at improving search engine quality and as an initial step toward developing a dataset of gold linguistic analysis of web search queries. In addition, it can be used as a basis for linguistic inquiries into the kind of expressions used in search.
@InProceedings{MARTON16.264,
author = {Yuval Marton and Kristina Toutanova}, title = {E-TIPSY: Search Query Corpus Annotated with Entities, Term Importance, POS Tags, and Syntactic Parses}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {may}, date = {23-28}, location = {Portorož, Slovenia}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {978-2-9517408-9-1}, language = {english} }