% Conference paper by Wang, Vatsalan and Christen in the PAKDD 2015 proceedings (LNCS).
% The hash-style citation key is kept unchanged so that existing \cite commands keep resolving.
@inproceedings{ff061b4882a840148d66d7865fab3cef,
  author    = {Wang, Qing and Vatsalan, Dinusha and Christen, Peter},
  title     = {Efficient interactive training selection for large-scale entity resolution},
  booktitle = {Advances in Knowledge Discovery and Data Mining -- 19th {Pacific-Asia} Conference, {PAKDD} 2015, Proceedings},
  editor    = {Cao, Tru and Lim, Ee-Peng and Ho, Tu-Bao and Zhou, Zhi-Hua and Motoda, Hiroshi and Cheung, David},
  series    = {Lecture Notes in Computer Science},
  volume    = {9078},
  pages     = {562--573},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2015},
  isbn      = {9783319180311},
  doi       = {10.1007/978-3-319-18032-8_44},
  language  = {English},
  keywords  = {Active learning, Data matching, Deduplication, Hierarchical clustering, Interactive labelling, Noisy oracle, Record linkage},
  abstract  = {Entity resolution (ER) has wide-spread applications in many areas, including e-commerce, health-care, the social sciences, and crime and fraud detection. A crucial step in ER is the accurate classification of pairs of records into matches (assumed to refer to the same entity) and non-matches (assumed to refer to different entities). In most practical ER applications it is difficult and costly to obtain training data of high quality and enough size, which impedes the learning of an ER classifier. We tackle this problem using an interactive learning algorithm that exploits the cluster structure in similarity vectors calculated from compared record pairs. We select informative training examples to assess the purity of clusters, and recursively split clusters until clusters pure enough for training are found. We consider two aspects of active learning that are significant in practical applications: a limited budget for the number of manual classifications that can be done, and a noisy oracle where manual labeling might be incorrect. Experiments using several real data sets show that manual labeling efforts can be significantly reduced for training an ER classifier without compromising matching quality.},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2015.; 19th Pacific-Asia Conference on Knowledge Discovery and Data Mining, PAKDD 2015 ; Conference date: 19-05-2015 Through 22-05-2015},
}