% Conference paper introducing the SchemaDB dataset; presented at the 20th Australasian Data Mining Conference (AusDM 2022).
@inproceedings{5c9019a9a8b44844a85f71ee379d7bfb,
  author    = {Christopher, Cody and Moore, Kristen and Liebowitz, David},
  title     = {{SchemaDB}: A Dataset for Structures in Relational Data},
  booktitle = {Data Mining},
  editor    = {Park, Laurence A. F. and Simoff, Simeon and Gomes, Heitor Murilo and Doborjeh, Maryam and Boo, Yee Ling and Koh, Yun Sing and Zhao, Yanchang and Williams, Graham},
  series    = {Communications in Computer and Information Science},
  pages     = {233--243},
  publisher = {Springer Nature Singapore},
  address   = {Singapore},
  year      = {2022},
  isbn      = {9789811987458},
  doi       = {10.1007/978-981-19-8746-5_17},
  language  = {English},
  keywords  = {Data transformation, Datasets, Machine learning, Relational databases, Web data collection},
  abstract  = {In this paper we introduce the SchemaDB dataset; a collection of relational database schemas in both sql and graph formats. Databases are not commonly shared publicly for reasons of privacy and security, and so the corresponding schema for these databases are often not available for study. Consequently, an understanding of database structures in the wild is lacking, and most easily found examples of schema found publicly belong to common development frameworks or are derived from textbooks or engine benchmarks. SchemaDB contains 2,500 samples of relational schema found in public code repositories which have been standardised to MySQL syntax. We provide our gathering and transformation methodology, summary statistics, structural analysis, and discuss potential downstream research tasks in several domains.},
  note      = {Publisher Copyright: {\textcopyright} 2022, The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd.; 20th Australasian Data Mining Conference, AusDM 2022 ; Conference date: 12-12-2022 Through 15-12-2022},
}