Contents analisys from text data requires semantic representations that are difficult to obtain automatically, as they may require large handcrafted knowledge bases or manually annotated examples. Unsupervised autonomous methods for generating semantic representations are of greatest interest in face of huge volumes of text to be exploited in all kinds of applications. In this work we describe the generation and validation of semantic representations in the vector space paradigm for Spanish. The method used is GloVe (Pennington, 2014), one of the best performing reported methods , and vectors were trained over Spanish Wikipedia. The learned vectors evaluation is done in terms of word analogy and similarity tasks (Pennington, 2014; Baroni, 2014; Mikolov, 2013a). The vector set and a Spanish version for some widely used semantic relatedness tests are made publicly available.
@InProceedings{ETCHEVERRY16.1212,
author = {Mathias Etcheverry and Dina Wonsever}, title = {Spanish Word Vectors from Wikipedia}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {may}, date = {23-28}, location = {Portorož, Slovenia}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {978-2-9517408-9-1}, language = {english} }