Information about the location of an action is often implicit in text, as humans can infer it based on common-sense knowledge. Today’s NLP systems, however, struggle with inferring information that goes beyond what is explicit in text. Selectional preference estimation based on large amounts of data provides a way to infer prototypical role fillers, but text-based systems tend to underestimate the probability of the most typical role fillers. Here we present a new dataset containing thematic fit judgments for 2,000 verb/location pairs. This dataset can be used for evaluating text-based, vision-based or multimodal inference systems for the typicality of an event’s location. We additionally provide three thematic fit baselines for this dataset: a state-of-the-art neural-network-based thematic fit model learned from linguistic data, a model estimating typical locations based on the MSCOCO dataset, and a simple combination of the systems.
@comment{
  MUKUZE18.1089: LREC 2018 conference paper.
  - month uses the standard `may` macro; the full conference date range that
    used to live in `month` is kept in `eventdate` (ignored by classic BibTeX
    styles, understood by biblatex).
  - Names are in unambiguous "Last, First" form; accents use LaTeX escapes so
    the entry also works with 8-bit BibTeX.
  - NOTE(review): `address` appears to hold the conference venue rather than
    the publisher's city; left unchanged, confirm against the target style.
}
@InProceedings{MUKUZE18.1089,
  author        = {Mukuze, Nelson and Rohrbach, Anna and Demberg, Vera and Schiele, Bernt},
  title         = {A vision-grounded dataset for predicting typical locations for verbs},
  booktitle     = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  year          = {2018},
  month         = may,
  eventdate     = {2018-05-07/2018-05-12},
  address       = {Miyazaki, Japan},
  editor        = {Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu},
  internal-note = {Nicoletta Calzolari is listed as Conference chair in the original record},
  publisher     = {European Language Resources Association (ELRA)},
  isbn          = {979-10-95546-00-9},
  language      = {english},
}