% Conference paper in the AusDM 2013 proceedings (CRPIT series).
% Names are stored in "Last, First" form; each editor is listed once.
@inproceedings{65075acb44e94e0b9724d7618bdd5017,
  author    = {Li, Shouheng and Liang, Huizhi and Ramadan, Banda},
  title     = {Two stage similarity-aware indexing for large-scale real-time entity resolution},
  booktitle = {Data Mining and Analytics 2013 - Proceedings of the 11th Australasian Data Mining Conference, {AusDM} 2013},
  editor    = {Zhao, Yanchang and Stranieri, Andrew and Liu, Lin and Kennedy, Paul and Christen, Peter and Ong, Kok-Leong},
  series    = {Conferences in Research and Practice in Information Technology Series},
  publisher = {Australian Computer Society},
  address   = {Australia},
  pages     = {107--116},
  year      = {2013},
  language  = {English},
  keywords  = {Blocking, Dynamic data, Entity resolution, Locality sensitive hashing, Real-time, Scalability},
  abstract  = {Entity resolution is the process of identifying records in one or multiple data sources that represent the same real-world entity. How to find all the records that belong to the same entity as the query record in real-time brings challenges to existing entity resolution approaches. The challenge is especially true for large-scale dataset. In this paper, we propose to use a two-stage similarity-aware indexing approach for large-scale real-time entity resolution. In the first stage, we use locality sensitive hashing to filter out records with low similarities for the purpose of decreasing the number of comparisons. Then, in the second stage, we pre-calculate the comparison similarities of the attribute values to further decrease the query time. The experiments conducted on a large-scale dataset with over 2 million records shows the effectiveness of the proposed approach.},
  note      = {Publisher Copyright: {\textcopyright} 2013, Australian Computer Society, Inc.},
}