Emotion Recognition (ER) is an important part of dialogue analysis which can be used in order to improve the quality of Spoken Dialogue Systems (SDSs). The emotional hypothesis of the current response of an end-user might be utilised by the dialogue manager component in order to change the SDS strategy which could result in a quality enhancement. In this study additional speaker-related information is used to improve the performance of the speech-based ER process. The analysed information is the speaker identity, gender and age of a user. Two schemes are described here, namely, using additional information as an independent variable within the feature vector and creating separate emotional models for each speaker, gender or age-cluster independently. The performances of the proposed approaches were compared against the baseline ER system, where no additional information has been used, on a number of emotional speech corpora of German, English, Japanese and Russian. The study revealed that for some of the corpora the proposed approach significantly outperforms the baseline methods with a relative difference of up to 11.9%.
@InProceedings{SIDOROV16.492,
author = {Maxim Sidorov and Alexander Schmitt and Eugene Semenkin and Wolfgang Minker}, title = {Could Speaker, Gender or Age Awareness be beneficial in Speech-based Emotion Recognition?}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {may}, date = {23-28}, location = {Portorož, Slovenia}, editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {978-2-9517408-9-1}, language = {english} }