This paper discusses the creation of a semantically annotated corpus of questions about patient data in electronic health records (EHRs). The goal is provide the training data necessary for semantic parsers to automatically convert EHR questions into a structured query. A layered annotation strategy is used which mirrors a typical natural language processing (NLP) pipeline. First, questions are syntactically analyzed to identify multi-part questions. Second, medical concepts are recognized and normalized to a clinical ontology. Finally, logical forms are created using a lambda calculus representation. We use a corpus of 446 questions asking for patient-specific information. From these, 468 specific questions are found containing 259 unique medical concepts and requiring 53 unique predicates to represent the logical forms. We further present detailed characteristics of the corpus, including inter-annotator agreement results, and describe the challenges automatic NLP systems will face on this task.
@InProceedings{ROBERTS16.152,
author = {Kirk Roberts and Dina Demner-Fushman}, title = {Annotating Logical Forms for EHR Questions}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {may}, date = {23-28}, location = {Portorož, Slovenia}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {978-2-9517408-9-1}, language = {english} }