@comment{Workshop paper record. The att_* fields are non-standard export metadata; standard bibliography styles ignore unknown fields.
NOTE(review): the att_abstract text (EMD-based clustering of mobility usage patterns) does not appear to describe the paper named in the title field -- confirm against the source record before relying on it.}
@inproceedings{TD:100453,
	att_abstract={{Multidimensional distributions are often used in data mining to describe and summarize different features of large datasets. It is natural to look for distinct classes in such datasets by clustering the data. A common approach entails the use of methods like k-means clustering. However, the k-means method inherently relies on the Euclidean metric in the embedded space and does not account for additional topology underlying the distribution.
In this paper, we propose using Earth Mover Distance (EMD) to compare multidimensional distributions. For an n-bin histogram, the EMD is based on a solution to the transportation problem with time complexity $O(n^3 \log n)$. To mitigate the high computational cost of EMD, we propose an approximation that reduces the cost to linear time.
Other notions of distances such as the information theoretic Kullback-Leibler divergence and statistical $\chi^2$ distance, account only for the correspondence between bins with the same index, and do not use information across bins, and are sensitive to bin size. A cross-bin distance measure like EMD is not affected by binning differences and meaningfully matches the perceptual notion of ``nearness''.
Our technique is simple, efficient and practical for clustering distributions. We demonstrate the use of EMD on a practical application of clustering over 400,000 anonymous mobility usage patterns which are defined as distributions over a manifold. EMD allows us to represent inherent relationships in this space. We show that EMD allows us to successfully cluster even sparse signatures and we compare the results with other clustering methods. Given the large size of our dataset a fast approximation is crucial for this application.}},
	att_authors={yj205v, nd1321, ag1971, ph2326, wh5769, gj2418, ss2864, sv1623},
	att_categories={C_NSS.7},
	att_copyright={{ACM}},
	att_copyright_notice={{(c) ACM, 2011. This is the author's version of the work. It is posted here by permission of ACM for your personal use. Not for redistribution. The definitive version was published in ACM SIGCOMM Workshop on Measurements Up the STack (W-MUST), 2011-08-19.}},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={},
	att_techdoc={true},
	att_techdoc_key={TD:100453},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100453_DS1_2011-08-12T20:48:12.621Z.pdf},
	author={Jin, Yu and Duffield, Nicholas and Gerber, Alexandre and Haffner, Patrick and Hsu, Wen-ling and Jacobson, Guy and Sen, Subhabrata and Venkataraman, Shobha and Zhang, Zhi-Li},
	booktitle={Proceedings of the {ACM} {SIGCOMM} Workshop on Measurements Up the {STack} ({W-MUST})},
	month=aug,
	title={Large-scale App-based Reporting of Customer Problems in Cellular Networks: Potential and Limitations},
	year=2011,
}