This study describes a new corpus of over 60,000 hand-annotated metadiscourse acts from 106 OpenCourseWare lectures, from two different disciplines: Physics and Economics. Metadiscourse is a set of linguistic expressions that signal different functions in the discourse. This type of language is hypothesised to be helpful in finding structure in unstructured text, such as lecture discourse. A brief summary is provided of the annotation scheme and labelling procedures, inter-annotator reliability statistics, overall distributional statistics, a description of auxiliary data that will be distributed with the corpus, and information on how to obtain the data. The results provide a deeper understanding of lecture structure and confirm the reliable coding of metadiscursive acts in academic lectures across different disciplines. The next stage of our research will be to build a classification model to automate the tagging process, replacing manual annotation, which takes time and effort. This is in addition to the use of these tags as indicators of the higher-level structure of lecture discourse.
% Conference paper in the LREC 2016 proceedings. `address` is the publisher's
% city; the conference venue and day range are kept in `venue` / `eventdate`
% (used by biblatex, ignored by classic BibTeX styles).
@InProceedings{ALHARBI16.1085,
  author        = {Alharbi, Ghada and Hain, Thomas},
  title         = {The {OpenCourseWare} Metadiscourse ({OCWMD}) Corpus},
  booktitle     = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016)},
  year          = {2016},
  month         = may,
  eventdate     = {2016-05-23/2016-05-28},
  venue         = {Portoro{\v{z}}, Slovenia},
  editor        = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Goggi, Sara and Grobelnik, Marko and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  publisher     = {European Language Resources Association (ELRA)},
  address       = {Paris, France},
  isbn          = {978-2-9517408-9-1},
  language      = {english},
  internal-note = {Nicoletta Calzolari is listed as Conference Chair},
}