TGermaCorp is a German text corpus whose primary sources are collected from German literature texts which date from the sixteenth century to the present. The corpus is intended to represent its target language (German) in syntactic, lexical, stylistic and chronological diversity. For this purpose, it is hand-annotated on several linguistic layers, including POS, lemma, named entities, multiword expressions, clauses, sentences and paragraphs. In order to introduce TGermaCorp in comparison to more homogeneous corpora of contemporary everyday language, quantitative assessments of syntactic and lexical diversity are provided. In this respect, TGermaCorp contributes to establishing characterising features for resource descriptions, which is needed for keeping track of a meaningful comparison of the ever-growing number of natural language resources. The assessments confirm the special role of proper names, whose propagation in text may influence lexical and syntactic diversity measures in rather trivial ways. TGermaCorp will be made available via hucompute.org.
@InProceedings{LUECKING16.444,
author = {Andy Luecking and Armin Hoenen and Alexander Mehler}, title = {TGermaCorp -- A (Digital) Humanities Resource for (Computational) Linguistics}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {may}, date = {23-28}, location = {Portorož, Slovenia}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {978-2-9517408-9-1}, language = {english} }