% Conference paper: Ranbaduge, Christen and Schnell, PAKDD 2020 (Springer LNCS).
%
% Maintenance notes:
%  - Names are stored in "Last, First" form so multi-part names parse unambiguously.
%  - The exported note (copyright line, conference name, conference dates) was moved
%    into the copyright / eventtitle / eventdate fields. Classic BibTeX styles ignore
%    these fields, so the export residue is no longer printed in the bibliography;
%    biblatex reads eventtitle / eventdate natively.
%  - eventdate is the exported "11-05-2020 Through 14-05-2020" rewritten as ISO 8601.
%  - address is the publisher's city, not a country. NOTE(review): Cham is assumed
%    from the "Springer Nature Switzerland AG" copyright line -- confirm against the
%    title page of the proceedings.
%  - NOTE(review): the LNCS volume number is not present in the exported record;
%    look it up via the DOI and add a volume field.
@inproceedings{14d1827bc5e642db83feabf1af588894,
  author     = {Ranbaduge, Thilina and Christen, Peter and Schnell, Rainer},
  title      = {Secure and Accurate Two-Step Hash Encoding for Privacy-Preserving Record Linkage},
  booktitle  = {Advances in Knowledge Discovery and Data Mining -- 24th {Pacific-Asia} Conference, {PAKDD} 2020, Proceedings},
  editor     = {Lauw, Hady W. and Lim, Ee-Peng and Wong, Raymond Chi-Wing and Ntoulas, Alexandros and Ng, See-Kiong and Pan, Sinno Jialin},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer},
  address    = {Cham},
  year       = {2020},
  pages      = {139--151},
  doi        = {10.1007/978-3-030-47436-2_11},
  isbn       = {978-3-030-47435-5},
  language   = {English},
  eventtitle = {24th {Pacific-Asia} Conference on Knowledge Discovery and Data Mining, {PAKDD} 2020},
  eventdate  = {2020-05-11/2020-05-14},
  copyright  = {{\textcopyright} Springer Nature Switzerland AG 2020},
  keywords   = {Hashing, Integer representation, Jaccard similarity},
  abstract   = {In order to discover new insights from data, there is a growing need to share information that is distributed across multiple databases that are often held by different organisations. One key task in data integration is the calculation of similarities between records to identify pairs or sets of records that correspond to the same real-world entities. Due to privacy and confidentiality concerns, however, the owners of sensitive databases are often not allowed or willing to exchange or share their data with other organisations to allow such similarity calculations. In this paper we propose a novel privacy-preserving encoding technique that can be used to securely calculate similarities between sensitive values held in different databases. Our technique uses two-step hashing to encode values into an integer set representation that provides strong privacy guarantees and allows accurate similarity calculations. We provide a theoretical analysis of the accuracy and privacy of our encoding technique, and conduct an empirical study on large real databases containing several millions records. Our results show that our technique provides high security against privacy attacks and achieves better similarity accuracy compared to two state-of-the-art encoding techniques.},
}