Abstract
Large amounts of spatio-temporal data are continuously collected through the use of location devices or sensor technologies. One of the techniques usually used to obtain a first insight on data is clustering. The Shared Nearest Neighbour (SNN) is a clustering algorithm that finds clusters with different densities, shapes and sizes, and also identifies noise in data, making it a good candidate to deal with spatial data. However, its time complexity is, in the worst case, O(n 2), compromising its scalability. This paper presents the use of a metric data structure, the kd-Tree, to index spatial data and support the SNN in querying for the k-nearest neighbours, improving the time complexity in the average case of the algorithm, when dealing with low dimensional data, to at most O(n ×logn). The proposed algorithm, the kd-SNN, was evaluated in terms of performance, showing huge improvements over existing approaches, allowing the identification of the main traffic routes by completely clustering a maritime data set.
Links
BibTeX (Download)
@incollection{faustino2014kd, title = {kd-SNN: a metric data structure seconding the clustering of spatial data}, author = {Bruno Faustino and João Moura Pires and Maribel Yasmina Santos and Guilherme Moreira}, url = {http://dx.doi.org/10.1007/978-3-319-09144-0_22}, isbn = {978-3-319-09143-3}, year = {2014}, date = {2014-01-01}, booktitle = {Computational Science and Its Applications--ICCSA 2014}, pages = {312--327}, publisher = {Springer}, abstract = {Large amounts of spatio-temporal data are continuously collected through the use of location devices or sensor technologies. One of the techniques usually used to obtain a first insight on data is clustering. The Shared Nearest Neighbour (SNN) is a clustering algorithm that finds clusters with different densities, shapes and sizes, and also identifies noise in data, making it a good candidate to deal with spatial data. However, its time complexity is, in the worst case, O(n 2), compromising its scalability. This paper presents the use of a metric data structure, the kd-Tree, to index spatial data and support the SNN in querying for the k-nearest neighbours, improving the time complexity in the average case of the algorithm, when dealing with low dimensional data, to at most O(n ×logn). The proposed algorithm, the kd-SNN, was evaluated in terms of performance, showing huge improvements over existing approaches, allowing the identification of the main traffic routes by completely clustering a maritime data set.}, keywords = {Clustering, Density-based Clustering, Metric-spaces, Shared Nearest Neighbour}, pubstate = {published}, tppubtype = {incollection} }