Title |
Induction of Treebank-Aligned Lexical Resources |
Authors |
Tejaswini Deoskar and Mats Rooth |
Abstract |
We describe the induction of lexical resources from unannotated corpora that are aligned with treebank grammars, providing a systematic correspondence between features in the lexical resource and a treebank syntactic resource. We first describe a methodology based on parsing technology for augmenting a treebank database with linguistic features. A PCFG containing these features is created from the augmented treebank. We then use a procedure based on the inside-outside algorithm to learn lexical resources aligned with the treebank PCFG from large unannotated corpora. The method has been applied in creating a feature-annotated English treebank based on the Penn Treebank. The unsupervised estimation procedure gives a substantial error reduction (up to 31.6%) on the task of learning the subcategorization preference of novel verbs that are not present in the annotated training sample. |
Language |
|
Topics |
Lexicon, lexical database, Statistical methods, Parsing Systems |
Full paper |
Induction of Treebank-Aligned Lexical Resources |
Slides |
Induction of Treebank-Aligned Lexical Resources |
Bibtex |
% LREC 2008 conference paper. Nicoletta Calzolari, listed first among the
% editors, was the Conference Chair; that role is recorded here because a
% parenthetical inside the editor field would be parsed as part of her name.
% The conference days (28-30 May 2008) are stored in eventdate.
@inproceedings{DEOSKAR08.798,
  author    = {Deoskar, Tejaswini and Rooth, Mats},
  title     = {Induction of Treebank-Aligned Lexical Resources},
  booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Tapias, Daniel},
  year      = {2008},
  month     = may,
  eventdate = {2008-05-28/2008-05-30},
  address   = {Marrakech, Morocco},
  publisher = {European Language Resources Association (ELRA)},
  isbn      = {2-9517408-4-0},
  url       = {http://www.lrec-conf.org/proceedings/lrec2008/},
  language  = {english},
} |