@inproceedings{congcong2020cls, address = {{Gaithersburg, MD}}, title = {Classification for {{Crisis}}-{{Related Tweets Leveraging Word Embeddings}} and {{Data Augmentation}}}, abstract = {This paper presents University College Dublin's (UCD) work at TREC 2019-B Incident Streams (IS) track. The purpose of the IS track is to find actionable messages and estimate their priority among a stream of crisis-related tweets. Based on the track's requirements, we break down the task into two sub-tasks. One is defined as a multi-label classification task that categorises upcoming tweets into different aid requests. The other is defined as a single-label classification task that estimates these tweets with four different levels of priority. For the track, we submitted four runs, each of which uses a different model for the tasks. Our baseline run trains classification models with hand-crafted features through machine learning methods, namely Logistic Regression and Na{\"i}ve Bayes. Our other three runs train classification models with different deep learning methods. The deep methods include a vanilla bidirectional long short-term memory recurrent neural network (biLSTM), an adapted biLSTM, and a bi-attentive classification network (BCN) with pre-trained contextualised ELMo embedding. For all the runs, we apply different word embeddings (in-domain pre-trained, word-level pre-trained GloVe, character-level, or ELMo embeddings) and data augmentation strategies (SMOTE, loss weights, or GPT-2) to explore the influence they have on performance. Evaluation results show that our models perform better than the median for most situations.}, booktitle = {Proceedings of the Twenty-Eighth {{Text REtrieval Conference}} ({{TREC}} 2019)}, author = {Wang, Congcong and Lillis, David}, year = {2020}, }