% Workshop paper presenting the DySimII dynamic similarity-aware inverted index
% for real-time entity resolution (PAKDD 2013 International Workshops).
% The citation key is the exporter-generated hash and is kept unchanged so
% existing \cite commands keep resolving.
% NOTE(review): the series volume number and the publisher are not recorded
% in this entry -- look them up via the DOI and add `volume` / `publisher`.
@inproceedings{61b3cfb7cfcc491ba6fe2e9aba77fcee,
  author    = {Ramadan, Banda and Christen, Peter and Liang, Huizhi and Gayler, Ross W. and Hawking, David},
  title     = {Dynamic similarity-aware inverted indexing for real-time entity resolution},
  booktitle = {Trends and Applications in Knowledge Discovery and Data Mining: {PAKDD} 2013 International Workshops},
  series    = {Lecture Notes in Computer Science},
  pages     = {47--58},
  year      = {2013},
  isbn      = {9783642403187},
  doi       = {10.1007/978-3-642-40319-4_5},
  language  = {English},
  keywords  = {Data matching, Duplicate detection, Dynamic indexing, Frequency-filtered indexing, Real-time query, Record linkage},
  abstract  = {Entity resolution is the process of identifying groups of records in a single or multiple data sources that represent the same real-world entity. It is an important tool in data de-duplication, in linking records across databases, and in matching query records against a database of existing entities. Most existing entity resolution techniques complete the resolution process offline and on static databases. However, real-world databases are often dynamic, and increasingly organizations need to resolve entities in real-time. Thus, there is a need for new techniques that facilitate working with dynamic databases in real-time. In this paper, we propose a dynamic similarity-aware inverted indexing technique (DySimII) that meets these requirements. We also propose a frequency-filtered indexing technique where only the most frequent attribute values are indexed. We experimentally evaluate our techniques on a large real-world voter database. The results show that when the index size grows no appreciable increase is found in the average record insertion time (around 0.1 msec) and in the average query time (less than 0.1 sec). We also find that applying the frequency-filtered approach reduces the index size with only a slight drop in recall.},
  note      = {17th Pacific-Asia Conference on Knowledge Discovery and Data Mining, {PAKDD} 2013, 14--17 April 2013},
}