Title |
Authorship Attribution of E-Mail: Comparing Classifiers over a New Corpus for Evaluation |
Authors |
Ben Allison and Louise Guthrie |
Abstract |
The release of the Enron corpus provided a unique resource for studying aspects of email use, because it is largely unfiltered, and therefore presents a relatively complete collection of emails for a reasonably large number of correspondents. This paper describes a newly created subcorpus of the Enron emails which we suggest can be used to test techniques for authorship attribution, and further shows the application of three different classification methods to this task to present baseline results. Two of the classifiers used are standard, and have been shown to perform well in the literature, and one of the classifiers is novel and based on concurrent work that proposes a Bayesian hierarchical distribution for word counts in documents. For each of the classifiers, we present results using six text representations, including use of linguistic structures derived from a parser as well as lexical information. |
Language |
Single language |
Topics |
Document Classification, Text categorisation, Statistical methods, Language modelling |
Full paper |
Authorship Attribution of E-Mail: Comparing Classifiers over a New Corpus for Evaluation |
Slides |
- |
Bibtex |
@InProceedings{ALLISON08.552,
  author        = {Allison, Ben and Guthrie, Louise},
  title         = {Authorship Attribution of E-Mail: Comparing Classifiers over a New Corpus for Evaluation},
  booktitle     = {Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC'08)},
  year          = {2008},
  month         = may,
  eventdate     = {2008-05-28/2008-05-30},
  address       = {Marrakech, Morocco},
  editor        = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Tapias, Daniel},
  internal-note = {Nicoletta Calzolari is listed as Conference Chair},
  publisher     = {European Language Resources Association (ELRA)},
  isbn          = {2-9517408-4-0},
  url           = {http://www.lrec-conf.org/proceedings/lrec2008/},
  language      = {english}
} |