% Conference paper in the CIKM 2018 proceedings (27th ACM International
% Conference on Information and Knowledge Management), pages 2213--2222.
% Names are stored in "Last, First" form so that the parser never has to guess
% the surname; the braces in "Ng, {Kee Siong}" keep the two-word given name
% together. Acronyms in booktitle are braced so styles cannot lowercase them.
% The address field holds the publisher's location, not the conference venue.
% The day, language, keywords and abstract fields are not used by the
% standard styles and are kept as metadata only.
@inproceedings{b23cd57f29bd4b6c955e54ae52c38fdc,
  author    = {Zhang, Yuhang and Churchill, Tania and Ng, {Kee Siong} and Christen, Peter},
  title     = {Scalable entity resolution using probabilistic signatures on parallel databases},
  booktitle = {{CIKM} 2018 - Proceedings of the 27th {ACM} International Conference on Information and Knowledge Management},
  editor    = {Paton, Norman and Candan, Selcuk and Wang, Haixun and Allan, James and Agrawal, Rakesh and Labrinidis, Alexandros and Cuzzocrea, Alfredo and Zaki, Mohammed and Srivastava, Divesh and Broder, Andrei and Schuster, Assaf},
  series    = {International Conference on Information and Knowledge Management, Proceedings},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York, NY, USA},
  pages     = {2213--2222},
  year      = {2018},
  month     = oct,
  day       = {17},
  doi       = {10.1145/3269206.3272016},
  language  = {English},
  keywords  = {Connected components, In-database analytics, Large-scale entity resolution, Probabilistic signature},
  abstract  = {Accurate and efficient entity resolution is an open challenge of particular relevance to intelligence organisations that collect large datasets from disparate sources with differing levels of quality and standard. Starting from a first-principles formulation of entity resolution, this paper presents a novel entity resolution algorithm that introduces a data-driven blocking and record linkage technique based on the probabilistic identification of entity signatures in data. The scalability and accuracy of the proposed algorithm are evaluated using benchmark datasets and shown to achieve state-of-the-art results. The proposed algorithm can be implemented simply on modern parallel databases, which we have done in the financial intelligence domain with tens of Terabytes of noisy data.},
  note      = {Publisher Copyright: {\textcopyright} 2018 Copyright held by the owner/author(s). Publication rights licensed to ACM.; 27th ACM International Conference on Information and Knowledge Management, CIKM 2018 ; Conference date: 22-10-2018 Through 26-10-2018},
}