% Fisher, Wang, Wong, Christen (2013): AusDM 2013 conference paper on cleaning and matching institution data in SCOPUS.
@inproceedings{c04feca385f5439fbb026f7d8bea7d3d,
title = "Data cleaning and matching of institutions in bibliographic databases",
abstract = "Bibliographic databases are very important for a variety of tasks for governments, academic institutions and businesses. These include assessing research output of institutions, performance evaluation of academics and compiling university rankings. However, incorrect or incomplete data in such databases can compromise any analysis and lead to poor decisions and financial loss. In this paper we detail our experience with an entity resolution project on Australian institution data using the SCOPUS bibliographic database. The goal of the project was to improve the entity resolution of institution data in SCOPUS so it could be used more effectively in other applications. We detail the methodology including a novel approach for extracting correct institution names from the values of one of the attributes. Along with the results from the project we present our insights into the specific characteristics and difficulties of the Australian institution data, and some techniques that were effective in addressing these. Finally, we present our conclusions and describe other situations where our experience and techniques could be applied.",
keywords = "Bibliographic databases, Data matching, Deduplication, SCOPUS",
author = "Fisher, Jeffrey and Wang, Qing and Wong, Paul and Christen, Peter",
note = "Publisher Copyright: {\textcopyright} 2013, Australian Computer Society, Inc.",
year = "2013",
language = "English",
series = "Conferences in Research and Practice in Information Technology Series",
publisher = "Australian Computer Society",
pages = "139--148",
editor = "Zhao, Yanchang and Stranieri, Andrew and Liu, Lin and Kennedy, Paul and Christen, Peter and Ong, Kok-Leong",
booktitle = "Data Mining and Analytics 2013 - Proceedings of the 11th Australasian Data Mining Conference, {AusDM} 2013",
address = "Australia",
}