Every year, the amount of text produced and stored increases, and so do the requirements to process and search it. Some of the largest corpora we build for use in Sketch Engine now take months to compile, and searching them leaves a lot to be desired even on today's state-of-the-art machines, mainly due to a lack of parallelization and to storage bottlenecks. We describe our experiments in distributing the processing of corpus operations over a cluster of commodity computers.
% Conference paper "Distributed Corpus Search" (LREC 2018 proceedings).
% The citation key is kept exactly as originally exported (including the
% non-ASCII letter) so that existing citations keep resolving.
% Every person is separated by " and " and written as "Last, First" so
% that name parsing is unambiguous.
% eventdate/venue hold the conference dates and place (biblatex fields,
% silently ignored by classic BibTeX styles); address is the publisher's city.
@inproceedings{RÁBARA18.15,
  author    = {Rábara, Radoslav and Rychlý, Pavel and Herman, Ondřej},
  title     = {Distributed Corpus Search},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Banski, Piotr and Kupietz, Marc and Barbaresi, Adrien and Biber, Hanno and Breiteneder, Evelyn and Clematide, Simon and Witt, Andreas},
  year      = {2018},
  month     = may,
  eventdate = {2018-05-07/2018-05-12},
  venue     = {Miyazaki, Japan},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris, France},
  isbn      = {979-10-95546-14-6},
  language  = {english},
}