#!/usr/bin/env python # # Copyright (c) 2011 Brian House # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # See for details. # """ For more info / discussion: http://blog.brianhouse.net/post/15629863565 """ import math def distance(a, b): """Euclidean distance between two sequences""" assert len(a) == len(b) t = sum((a[i] - b[i])**2 for i in xrange(len(a))) return t**(1.0/len(a)) def geo_distance(pt0, pt1, miles=True): """Return the distance between two points, specified (lon, lat), in miles (or kilometers)""" LON, LAT = 0, 1 pt0 = math.radians(pt0[LON]), math.radians(pt0[LAT]) pt1 = math.radians(pt1[LON]), math.radians(pt1[LAT]) lon_delta = pt1[LON] - pt0[LON] lat_delta = pt1[LAT] - pt0[LAT] a = math.sin(lat_delta / 2)**2 + math.cos(pt0[LAT]) * math.cos(pt1[LAT]) * math.sin(lon_delta / 2)**2 c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) d = 6371 * c # radius of Earth in km if miles: d *= 0.621371192 return d class ClusterTree(object): """Hierarchical agglomerative clustering""" @staticmethod def build(vectors, distance_f=distance): # use geo_distance for geography """Build a ClusterTree on the set of input vectors and a given distance function""" distance_f = distance_f vectors = vectors root = None print("ClusterTree learning %d vectors..." % len(vectors)) distances = {} current_cluster_id = -1 # start with every feature in its own cluster cluster = [ClusterTree(vectors[i], id=i) for i in xrange(len(vectors))] while len(cluster) > 1: # find the smallest distance between pairs closest_pair = (0, 1) min_distance = distance_f(cluster[0].vector, cluster[1].vector) for i in xrange(len(cluster)): for j in xrange(i + 1, len(cluster)): if (cluster[i].id, cluster[j].id) not in distances: # cache calcs distances[(cluster[i].id, cluster[j].id)] = distance_f(cluster[i].vector, cluster[j].vector) d = distances[(cluster[i].id, cluster[j].id)] if d < min_distance: min_distance = d closest_pair = (i, j) # create new cluster with a merged vector that is the average between clusters average_vector = [(cluster[closest_pair[0]].vector[i] + cluster[closest_pair[1]].vector[i]) / 2.0 for i in xrange(len(cluster[0].vector))] new_cluster = ClusterTree(average_vector, id=current_cluster_id, left=cluster[closest_pair[0]], right=cluster[closest_pair[1]]) current_cluster_id = current_cluster_id - 1 # remove the clusters that have been agglomerated and add the new one to the cluster list del cluster[closest_pair[1]] del cluster[closest_pair[0]] cluster.append(new_cluster) # just one cluster remaining print("--> done. Calculating radii...") # radius is the furthest leaf node from a cluster's centroid def calc_radii(master, cluster=None): if cluster is None: master.radius_calculated = True calc_radii(master, master) elif cluster.id >= 0: # leaf d = distance_f(master.vector, cluster.vector) if d > master.radius: master.radius = d else: # branches if cluster.left is not None: calc_radii(master, cluster.left) if not cluster.left.radius_calculated: calc_radii(cluster.left) if cluster.right is not None: calc_radii(master, cluster.right) if not cluster.right.radius_calculated: calc_radii(cluster.right) calc_radii(cluster[0], cluster[0]) print("--> done") return cluster[0] def __init__(self, vector, id=None, left=None, right=None): self.left = left self.right = right self.vector = vector self.id = id self.radius = 0.0 self.radius_calculated = False def get_order(self): """Return a list of indexes for how the input vectors are ordered in a depth first enumeration of the cluster, for ordering by likeness""" ids = self.get_leaf_ids() inverse = zip(range(len(ids)), ids) inverse.sort(key=lambda t: t[1]) inverse, nop = zip(*inverse) return inverse def get_leaf_ids(self): """Perform a depth first listing of ids for elements""" if self.id >= 0: # leaf return [self.id] else: # branches left_branch = [] right_branch = [] if self.left is not None: left_branch = self.left.get_leaf_ids() if self.right is not None: right_branch = self.right.get_leaf_ids() return left_branch + right_branch def get_pruned(self, max_radius): """Prune cluster until we have a set of sub-clusters with a maximum radius, return as a list of primary branches""" if self.radius <= max_radius: return [self] else: left_branch = [] right_branch = [] if self.left is not None: left_branch = self.left.get_pruned(max_radius) if self.right is not None: right_branch = self.right.get_pruned(max_radius) return left_branch + right_branch def draw(self, n=0): """Indent to make a hierarchy layout""" output = [] for i in xrange(n-1): # output.append("| ") output.append(" ") if n: output.append("|\n") for i in xrange(n-1): # output.append("| ") output.append(" ") output.append("|_") if self.id < 0: # negative id means that this is branch output.append("%f\n" % (self.radius)) else: # positive id means that this is an endpoint output.append("<%d> %s\n" % (self.id, self.vector)) # now print the right and left branches if self.left != None: output.append(self.left.draw(n+1)) if self.right != None: output.append(self.right.draw(n+1)) output = "".join(output) return output def __repr__(self): return "" % (self.id, self.radius, str(self.vector)) if __name__ == "__main__": # lonlats vectors = [ (-73.959587, 40.700550), (-74.005478, 40.736172), (-74.001495, 40.730076), (-73.989952, 40.755913), (-73.961044, 40.680523), (-73.988213, 40.756870), (-73.998276, 40.745892), (-73.988243, 40.758320), (-74.003242, 40.736607), (-73.992645, 40.737282), (-73.964523, 40.687843), (-73.959976, 40.689465), (-73.990227, 40.663258), (-73.959084, 40.718075), (-73.965073, 40.684383), (-73.959305, 40.682167), (-73.983543, 40.686768), (-73.987846, 40.757870), (-73.959839, 40.675007), (-74.002655, 40.748798), (-73.982246, 40.745609), (-74.003578, 40.713501), (-73.990356, 40.754631), (-73.990501, 40.756229), (-73.954521, 40.725109), (-73.967308, 40.691536), (-73.954231, 40.723579), (-73.993607, 40.736607), (-73.961525, 40.684975), (-73.959549, 40.685371) ] ct = ClusterTree.build(vectors, geo_distance) clusters = ct.get_pruned(0.25) print(ct.draw())