% Conference paper "JTAV" in the COLING 2018 proceedings, pages 1269--1280.
% Cleanup notes:
%   - Acronyms in title/booktitle are brace-protected so that styles which
%     sentence-case titles do not lowercase them.
%   - The former series field repeated booktitle verbatim and was dropped,
%     since styles that print series would show the proceedings title twice.
%   - Event name/dates and the copyright line used to live in note, which
%     standard styles print inside the reference; they now sit in
%     eventtitle / eventdate / copyright (unknown fields are ignored by
%     classic BibTeX styles).
%   - All names use the unambiguous "Last, First" form.
% NOTE(review): address holds only a country; BibTeX expects the publisher's
% city in this field -- confirm the correct value and replace it.
@inproceedings{97ed85a0fcb74ba8968be9cd6b8991e2,
  author     = {Liang, Hongru and Wang, Haozheng and Wang, Jun and You, Shaodi and Sun, Zhe and Wei, Jin Mao and Yang, Zhenglu},
  title      = {{JTAV}: Jointly learning social media content representation by fusing textual, acoustic, and visual features},
  booktitle  = {{COLING} 2018 - 27th International Conference on Computational Linguistics, Proceedings},
  editor     = {Bender, Emily M. and Derczynski, Leon and Isabelle, Pierre},
  pages      = {1269--1280},
  publisher  = {Association for Computational Linguistics (ACL)},
  address    = {United States},
  year       = {2018},
  language   = {English},
  eventtitle = {27th International Conference on Computational Linguistics, {COLING} 2018},
  eventdate  = {2018-08-20/2018-08-26},
  copyright  = {{\textcopyright} 2018 COLING 2018 - 27th International Conference on Computational Linguistics, Proceedings. All rights reserved.},
  abstract   = {Learning social media content is the basis of many real-world applications, including information retrieval and recommendation systems, among others. In contrast with previous works that focus mainly on single modal or bi-modal learning, we propose to learn social media content by fusing jointly textual, acoustic, and visual information (JTAV). Effective strategies are proposed to extract fine-grained features of each modality, that is, attBiGRU and DCRNN. We also introduce cross-modal fusion and attentive pooling techniques to integrate multi-modal information comprehensively. Extensive experimental evaluation conducted on real-world datasets demonstrates our proposed model outperforms the state-of-the-art approaches by a large margin.},
}