% Conference paper in the ADCS 2015 proceedings.
% `address` is the publisher's location (not the conference venue); the
% conference name and dates are carried in `note`.
@inproceedings{48b3f786a8b24972bc3503099bfa5afb,
  author    = {Zhou, Liyuan and Hawking, David and Thomas, Paul},
  title     = {Text segmentation and {Chinese} site search},
  booktitle = {{ADCS} 2015 - Proceedings of the 20th {Australasian} Document Computing Symposium},
  editor    = {Karimi, Sarvnaz and Park, Laurence A. F.},
  series    = {ACM International Conference Proceeding Series},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York, NY, USA},
  year      = {2015},
  month     = dec,
  day       = {8},
  doi       = {10.1145/2838931.2838940},
  keywords  = {Chinese IR, Segmentation, Site search},
  abstract  = {Automatic segmentation and overlapping bigrams are the most common methods for overcoming the lack of explicit word boundaries in Chinese text. Past studies have compared their effectiveness, but findings have been equivocal and site search has been little studied. We compare representatives of the two approaches using a 465,000 page crawl and test queries applicable to the university context. 503 pairs of result sets were judged by 56 Chinese students. Although there are differences on certain queries, we find no overall advantage to either method. To understand the merits of each approach, we analyze cases where they performed differently. Our analysis enumerates situations which favour segmentation, and those which favour bigrams. We observe that further improvements in segmentation accuracy will not improve retrieval effectiveness.},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} ACM.; 20th Australasian Document Computing Symposium, ADCS 2015 ; Conference date: 08-12-2015 Through 09-12-2015},
}