remove fastcluster (#29352)
* remove fastcluster
* lock
* rm there
* and from release files
parent 0ced56b2ea
commit 00a11a1a2b
16 changed files with 1 addition and 2574 deletions
@@ -1,140 +0,0 @@
# pylint: skip-file
import time
import unittest
import numpy as np
from fastcluster import linkage_vector
from scipy.cluster import _hierarchy
from scipy.spatial.distance import pdist

from third_party.cluster.fastcluster_py import hclust, ffi
from third_party.cluster.fastcluster_py import cluster_points_centroid


def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
  # supersimplified function to get fast clustering. Got it from scipy
  Z = np.asarray(Z, order='c')
  n = Z.shape[0] + 1
  T = np.zeros((n,), dtype='i')
  _hierarchy.cluster_dist(Z, T, float(t), int(n))
  return T


TRACK_PTS = np.array([[59.26000137, -9.35999966, -5.42500019],
                      [91.61999817, -0.31999999, -2.75],
                      [31.38000031, 0.40000001, -0.2],
                      [89.57999725, -8.07999992, -18.04999924],
                      [53.42000122, 0.63999999, -0.175],
                      [31.38000031, 0.47999999, -0.2],
                      [36.33999939, 0.16, -0.2],
                      [53.33999939, 0.95999998, -0.175],
                      [59.26000137, -9.76000023, -5.44999981],
                      [33.93999977, 0.40000001, -0.22499999],
                      [106.74000092, -5.76000023, -18.04999924]])

CORRECT_LINK = np.array([[2., 5., 0.07999998, 2.],
                         [4., 7., 0.32984889, 2.],
                         [0., 8., 0.40078104, 2.],
                         [6., 9., 2.41209933, 2.],
                         [11., 14., 3.76342275, 4.],
                         [12., 13., 13.02297651, 4.],
                         [1., 3., 17.27626057, 2.],
                         [10., 17., 17.92918845, 3.],
                         [15., 16., 23.68525366, 8.],
                         [18., 19., 52.52351319, 11.]])

CORRECT_LABELS = np.array([7, 1, 4, 2, 6, 4, 5, 6, 7, 5, 3], dtype=np.int32)


def plot_cluster(pts, idx_old, idx_new):
  import matplotlib.pyplot as plt
  m = 'Set1'

  plt.figure()
  plt.subplot(1, 2, 1)
  plt.scatter(pts[:, 0], pts[:, 1], c=idx_old, cmap=m)
  plt.title("Old")
  plt.colorbar()
  plt.subplot(1, 2, 2)
  plt.scatter(pts[:, 0], pts[:, 1], c=idx_new, cmap=m)
  plt.title("New")
  plt.colorbar()

  plt.show()


def same_clusters(correct, other):
  correct = np.asarray(correct)
  other = np.asarray(other)
  if len(correct) != len(other):
    return False

  for i in range(len(correct)):
    c = np.where(correct == correct[i])
    o = np.where(other == other[i])
    if not np.array_equal(c, o):
      return False
  return True


class TestClustering(unittest.TestCase):
  def test_scipy_clustering(self):
    old_link = linkage_vector(TRACK_PTS, method='centroid')
    old_cluster_idxs = fcluster(old_link, 2.5, criterion='distance')

    np.testing.assert_allclose(old_link, CORRECT_LINK)
    np.testing.assert_allclose(old_cluster_idxs, CORRECT_LABELS)

  def test_pdist(self):
    pts = np.ascontiguousarray(TRACK_PTS, dtype=np.float64)
    pts_ptr = ffi.cast("double *", pts.ctypes.data)

    n, m = pts.shape
    out = np.zeros((n * (n - 1) // 2, ), dtype=np.float64)
    out_ptr = ffi.cast("double *", out.ctypes.data)
    hclust.hclust_pdist(n, m, pts_ptr, out_ptr)

    np.testing.assert_allclose(out, np.power(pdist(TRACK_PTS), 2))

  def test_cpp_clustering(self):
    pts = np.ascontiguousarray(TRACK_PTS, dtype=np.float64)
    pts_ptr = ffi.cast("double *", pts.ctypes.data)
    n, m = pts.shape

    labels = np.zeros((n, ), dtype=np.int32)
    labels_ptr = ffi.cast("int *", labels.ctypes.data)
    hclust.cluster_points_centroid(n, m, pts_ptr, 2.5**2, labels_ptr)
    self.assertTrue(same_clusters(CORRECT_LABELS, labels))

  def test_cpp_wrapper_clustering(self):
    labels = cluster_points_centroid(TRACK_PTS, 2.5)
    self.assertTrue(same_clusters(CORRECT_LABELS, labels))

  def test_random_cluster(self):
    np.random.seed(1337)
    N = 1000

    t_old = 0.
    t_new = 0.

    for _ in range(N):
      n = int(np.random.uniform(2, 32))
      x = np.random.uniform(-10, 50, (n, 1))
      y = np.random.uniform(-5, 5, (n, 1))
      vrel = np.random.uniform(-5, 5, (n, 1))
      pts = np.hstack([x, y, vrel])

      t = time.time()
      old_link = linkage_vector(pts, method='centroid')
      old_cluster_idx = fcluster(old_link, 2.5, criterion='distance')
      t_old += time.time() - t

      t = time.time()
      cluster_idx = cluster_points_centroid(pts, 2.5)
      t_new += time.time() - t

      self.assertTrue(same_clusters(old_cluster_idx, cluster_idx))


if __name__ == "__main__":
  unittest.main()
@@ -1 +0,0 @@
test
@@ -1,79 +0,0 @@
C++ interface to fast hierarchical clustering algorithms
========================================================

This is a simplified C++ interface to fast implementations of hierarchical
clustering by Daniel Müllner. The original library with interfaces to R
and Python is described in:

Daniel Müllner: "fastcluster: Fast Hierarchical, Agglomerative Clustering
Routines for R and Python." Journal of Statistical Software 53 (2013),
no. 9, pp. 1–18, http://www.jstatsoft.org/v53/i09/


Usage of the library
--------------------

To use the library, the following source files are needed:

    fastcluster_dm.cpp, fastcluster_R_dm.cpp
        original code by Daniel Müllner; these are included by
        fastcluster.cpp via #include and therefore need not be
        compiled to object code

    fastcluster.[h|cpp]
        simplified C++ interface; fastcluster.cpp is the only file
        that must be compiled

The library provides the clustering function *hclust_fast* for
creating the dendrogram information in the encoding used by the
R function *hclust*. For a description of the parameters, see fastcluster.h.
Its parameter *method* can be one of

    HCLUST_METHOD_SINGLE
        single link with the minimum spanning tree algorithm (Rohlf, 1973)

    HCLUST_METHOD_COMPLETE
        complete link with the nearest-neighbor-chain algorithm (Murtagh, 1984)

    HCLUST_METHOD_AVERAGE
        average link with the nearest-neighbor-chain algorithm (Murtagh, 1984)

    HCLUST_METHOD_MEDIAN
        median link with the generic algorithm (Müllner, 2011)

For splitting the dendrogram into clusters, the two functions *cutree_k*
and *cutree_cdist* are provided.

Note that output parameters must be allocated beforehand, e.g.

    int* merge = new int[2*(npoints-1)];

For a complete usage example, see lines 135-142 of demo.cpp.
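As a rough sketch, assuming the caller has already filled the condensed
distance matrix and allocated an n-element label array (the helper name
cluster_example below is illustrative, not part of the library), centroid
clustering cut at a distance threshold might look like:

    extern "C" {
    #include "fastcluster.h"
    }

    // illustrative helper, not part of the library
    void cluster_example(int n, double* distmat, double cdist, int* labels) {
      // distmat: condensed distance matrix for n observables, filled by the caller
      // labels:  caller-allocated array of n ints receiving the cluster labels
      int* merge = new int[2*(n-1)];     // dendrogram in R hclust format
      double* height = new double[n-1];  // merge distance at each step

      hclust_fast(n, distmat, HCLUST_METHOD_CENTROID, merge, height);
      cutree_cdist(n, merge, height, cdist, labels);

      delete[] merge;
      delete[] height;
    }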

Demonstration program
---------------------

A simple demo is implemented in demo.cpp, which can be compiled and run with

    make
    ./hclust-demo -m complete lines.csv

It creates two clusters of line segments such that the angle between line
segments of different clusters has maximum (cosine) dissimilarity.
For visualizing the result, plotresult.r can be used as follows
(requires R <https://r-project.org> to be installed):

    ./hclust-demo -m complete lines.csv | Rscript plotresult.r


Authors & Copyright
-------------------

Daniel Müllner, 2011, <http://danifold.net>
Christoph Dalitz, 2018, <http://www.hsnr.de/ipattern/>


License
-------

This code is provided under a BSD-style license.
See the file LICENSE for details.
@@ -1,8 +0,0 @@
Import('env')

fc = env.SharedLibrary("fastcluster", "fastcluster.cpp")

# TODO: how do I gate on test
#env.Program("test", ["test.cpp"], LIBS=[fc])
#valgrind --leak-check=full ./test
@@ -1,218 +0,0 @@
//
// C++ standalone version of fastcluster by Daniel Müllner
//
// Copyright: Christoph Dalitz, 2018
//            Daniel Müllner, 2011
// License:   BSD style license
//            (see the file LICENSE for details)
//

#include <vector>
#include <algorithm>
#include <cmath>

extern "C" {
#include "fastcluster.h"
}

// Code by Daniel Müllner
// workaround to make it usable as a standalone version (without R)
bool fc_isnan(double x) { return false; }
#include "fastcluster_dm.cpp"
#include "fastcluster_R_dm.cpp"

extern "C" {
//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the cluster result is split into nclust clusters.
//
// Input arguments:
//   n      = number of observables
//   merge  = clustering result in R format
//   nclust = number of clusters
// Output arguments:
//   labels = allocated integer array of size n for result
//
void cutree_k(int n, const int* merge, int nclust, int* labels) {

  int k,m1,m2,j,l;

  if (nclust > n || nclust < 2) {
    for (j=0; j<n; j++) labels[j] = 0;
    return;
  }

  // assign to each observable the number of its last merge step
  // beware: indices of observables in merge start at 1 (R convention)
  std::vector<int> last_merge(n, 0);
  for (k=1; k<=(n-nclust); k++) {
    // (m1,m2) = merge[k,]
    m1 = merge[k-1];
    m2 = merge[n-1+k-1];
    if (m1 < 0 && m2 < 0) { // both single observables
      last_merge[-m1-1] = last_merge[-m2-1] = k;
    }
    else if (m1 < 0 || m2 < 0) { // one is a cluster
      if(m1 < 0) { j = -m1; m1 = m2; } else j = -m2;
      // merging single observable and cluster
      for(l = 0; l < n; l++)
        if (last_merge[l] == m1)
          last_merge[l] = k;
      last_merge[j-1] = k;
    }
    else { // both cluster
      for(l=0; l < n; l++) {
        if( last_merge[l] == m1 || last_merge[l] == m2 )
          last_merge[l] = k;
      }
    }
  }

  // assign cluster labels
  int label = 0;
  std::vector<int> z(n,-1);
  for (j=0; j<n; j++) {
    if (last_merge[j] == 0) { // still singleton
      labels[j] = label++;
    } else {
      if (z[last_merge[j]] < 0) {
        z[last_merge[j]] = label++;
      }
      labels[j] = z[last_merge[j]];
    }
  }
}

//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the hierarchical clustering is stopped when cluster distance >= cdist
//
// Input arguments:
//   n      = number of observables
//   merge  = clustering result in R format
//   height = cluster distance at each merge step
//   cdist  = cutoff cluster distance
// Output arguments:
//   labels = allocated integer array of size n for result
//
void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels) {

  int k;

  for (k=0; k<(n-1); k++) {
    if (height[k] >= cdist) {
      break;
    }
  }
  cutree_k(n, merge, n-k, labels);
}


//
// Hierarchical clustering with one of Daniel Müllner's fast algorithms
//
// Input arguments:
//   n       = number of observables
//   distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing
//             the upper triangle (without diagonal elements) of the distance
//             matrix, e.g. for n=4:
//               d00 d01 d02 d03
//               d10 d11 d12 d13   ->   d01 d02 d03 d12 d13 d23
//               d20 d21 d22 d23
//               d30 d31 d32 d33
//   method  = cluster metric (see enum method_code)
// Output arguments:
//   merge   = allocated (n-1)x2 matrix (2*(n-1) array) for storing result.
//             Result follows R hclust convention:
//              - observable indices start with one
//              - merge[i][] contains the merged nodes in step i
//              - merge[i][j] is negative when the node is an atom
//   height  = allocated (n-1) array with distances at each merge step
// Return code:
//   0 = ok
//   1 = invalid method
//
int hclust_fast(int n, double* distmat, int method, int* merge, double* height) {

  // call appropriate clustering function
  cluster_result Z2(n-1);
  if (method == HCLUST_METHOD_SINGLE) {
    // single link
    MST_linkage_core(n, distmat, Z2);
  }
  else if (method == HCLUST_METHOD_COMPLETE) {
    // complete link
    NN_chain_core<METHOD_METR_COMPLETE, t_float>(n, distmat, NULL, Z2);
  }
  else if (method == HCLUST_METHOD_AVERAGE) {
    // best average distance
    double* members = new double[n];
    for (int i=0; i<n; i++) members[i] = 1;
    NN_chain_core<METHOD_METR_AVERAGE, t_float>(n, distmat, members, Z2);
    delete[] members;
  }
  else if (method == HCLUST_METHOD_MEDIAN) {
    // best median distance (beware: O(n^3))
    generic_linkage<METHOD_METR_MEDIAN, t_float>(n, distmat, NULL, Z2);
  }
  else if (method == HCLUST_METHOD_CENTROID) {
    // best centroid distance (beware: O(n^3))
    double* members = new double[n];
    for (int i=0; i<n; i++) members[i] = 1;
    generic_linkage<METHOD_METR_CENTROID, t_float>(n, distmat, members, Z2);
    delete[] members;
  }
  else {
    return 1;
  }

  int* order = new int[n];
  if (method == HCLUST_METHOD_MEDIAN || method == HCLUST_METHOD_CENTROID) {
    generate_R_dendrogram<true>(merge, height, order, Z2, n);
  } else {
    generate_R_dendrogram<false>(merge, height, order, Z2, n);
  }
  delete[] order; // only needed for visualization

  return 0;
}


// Build condensed distance matrix
// Input arguments:
//   n   = number of observables
//   m   = dimension of observable
// Output arguments:
//   out = allocated double array of size n * (n - 1) / 2 for result
void hclust_pdist(int n, int m, double* pts, double* out) {
  int ii = 0;
  for (int i = 0; i < n; i++) {
    for (int j = i + 1; j < n; j++) {
      // compute squared Euclidean distance
      double d = 0;
      for (int k = 0; k < m; k ++) {
        double error = pts[i * m + k] - pts[j * m + k];
        d += (error * error);
      }
      out[ii] = d;  // squared distance; sqrt(d) intentionally omitted
      ii++;
    }
  }
}

void cluster_points_centroid(int n, int m, double* pts, double dist, int* idx) {
  double* pdist = new double[n * (n - 1) / 2];
  int* merge = new int[2 * (n - 1)];
  double* height = new double[n - 1];

  hclust_pdist(n, m, pts, pdist);
  hclust_fast(n, pdist, HCLUST_METHOD_CENTROID, merge, height);
  cutree_cdist(n, merge, height, dist, idx);

  delete[] pdist;
  delete[] merge;
  delete[] height;
}
}
@@ -1,77 +0,0 @@
//
// C++ standalone version of fastcluster by Daniel Müllner
//
// Copyright: Daniel Müllner, 2011
//            Christoph Dalitz, 2018
// License:   BSD style license
//            (see the file LICENSE for details)
//

#ifndef fastclustercpp_H
#define fastclustercpp_H

//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the cluster result is split into nclust clusters.
//
// Input arguments:
//   n      = number of observables
//   merge  = clustering result in R format
//   nclust = number of clusters
// Output arguments:
//   labels = allocated integer array of size n for result
//
void cutree_k(int n, const int* merge, int nclust, int* labels);

//
// Assigns cluster labels (0, ..., nclust-1) to the n points such
// that the hierarchical clustering is stopped at cluster distance cdist
//
// Input arguments:
//   n      = number of observables
//   merge  = clustering result in R format
//   height = cluster distance at each merge step
//   cdist  = cutoff cluster distance
// Output arguments:
//   labels = allocated integer array of size n for result
//
void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels);

//
// Hierarchical clustering with one of Daniel Müllner's fast algorithms
//
// Input arguments:
//   n       = number of observables
//   distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing
//             the upper triangle (without diagonal elements) of the distance
//             matrix, e.g. for n=4:
//               d00 d01 d02 d03
//               d10 d11 d12 d13   ->   d01 d02 d03 d12 d13 d23
//               d20 d21 d22 d23
//               d30 d31 d32 d33
//   method  = cluster metric (see enum method_code)
// Output arguments:
//   merge   = allocated (n-1)x2 matrix (2*(n-1) array) for storing result.
//             Result follows R hclust convention:
//              - observable indices start with one
//              - merge[i][] contains the merged nodes in step i
//              - merge[i][j] is negative when the node is an atom
//   height  = allocated (n-1) array with distances at each merge step
// Return code:
//   0 = ok
//   1 = invalid method
//
int hclust_fast(int n, double* distmat, int method, int* merge, double* height);
enum hclust_fast_methods {
  HCLUST_METHOD_SINGLE = 0,
  HCLUST_METHOD_COMPLETE = 1,
  HCLUST_METHOD_AVERAGE = 2,
  HCLUST_METHOD_MEDIAN = 3,
  HCLUST_METHOD_CENTROID = 5,
};

void hclust_pdist(int n, int m, double* pts, double* out);
void cluster_points_centroid(int n, int m, double* pts, double dist, int* idx);

#endif
@@ -1,115 +0,0 @@
//
// Excerpt from fastcluster_R.cpp
//
// Copyright: Daniel Müllner, 2011 <http://danifold.net>
//

struct pos_node {
  t_index pos;
  int node;
};

void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) {
  /* Parameters:
     N         : number of data points
     merge     : (N-1)×2 array which specifies the node indices which are
                 merged in each step of the clustering procedure.
                 Negative entries -1...-N point to singleton nodes, while
                 positive entries 1...(N-1) point to nodes which are themselves
                 parents of other nodes.
     node_size : array of node sizes - makes it easier
     order     : output array of size N

     Runtime: Θ(N)
  */
  auto_array_ptr<pos_node> queue(N/2);

  int parent;
  int child;
  t_index pos = 0;

  queue[0].pos = 0;
  queue[0].node = N-2;
  t_index idx = 1;

  do {
    --idx;
    pos = queue[idx].pos;
    parent = queue[idx].node;

    // First child
    child = merge[parent];
    if (child<0) { // singleton node, write this into the 'order' array.
      order[pos] = -child;
      ++pos;
    }
    else { /* compound node: put it on top of the queue and decompose it
              in a later iteration. */
      queue[idx].pos = pos;
      queue[idx].node = child-1; // convert index-1 based to index-0 based
      ++idx;
      pos += node_size[child-1];
    }
    // Second child
    child = merge[parent+N-1];
    if (child<0) {
      order[pos] = -child;
    }
    else {
      queue[idx].pos = pos;
      queue[idx].node = child-1;
      ++idx;
    }
  } while (idx>0);
}

#define size_(r_) ( ((r_<N) ? 1 : node_size[r_-N]) )

template <const bool sorted>
void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) {
  // The array "nodes" is a union-find data structure for the cluster
  // identities (only needed for unsorted cluster_result input).
  union_find nodes(sorted ? 0 : N);
  if (!sorted) {
    std::stable_sort(Z2[0], Z2[N-1]);
  }

  t_index node1, node2;
  auto_array_ptr<t_index> node_size(N-1);

  for (t_index i=0; i<N-1; ++i) {
    // Get two data points whose clusters are merged in step i.
    // Find the cluster identifiers for these points.
    if (sorted) {
      node1 = Z2[i]->node1;
      node2 = Z2[i]->node2;
    }
    else {
      node1 = nodes.Find(Z2[i]->node1);
      node2 = nodes.Find(Z2[i]->node2);
      // Merge the nodes in the union-find data structure by making them
      // children of a new node.
      nodes.Union(node1, node2);
    }
    // Sort the nodes in the output array.
    if (node1>node2) {
      t_index tmp = node1;
      node1 = node2;
      node2 = tmp;
    }
    /* Conversion between labeling conventions.
       Input:  singleton nodes 0,...,N-1
               compound nodes  N,...,2N-2
       Output: singleton nodes -1,...,-N
               compound nodes  1,...,N
    */
    merge[i]     = (node1<N) ? -static_cast<int>(node1)-1
                             :  static_cast<int>(node1)-N+1;
    merge[i+N-1] = (node2<N) ? -static_cast<int>(node2)-1
                             :  static_cast<int>(node2)-N+1;
    height[i] = Z2[i]->dist;
    node_size[i] = size_(node1) + size_(node2);
  }

  order_nodes(N, merge, node_size, order);
}
File diff suppressed because it is too large
@@ -1,28 +0,0 @@
import os
import numpy as np

from cffi import FFI
from common.ffi_wrapper import suffix

cluster_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)))
cluster_fn = os.path.join(cluster_dir, "libfastcluster"+suffix())

ffi = FFI()
ffi.cdef("""
int hclust_fast(int n, double* distmat, int method, int* merge, double* height);
void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels);
void hclust_pdist(int n, int m, double* pts, double* out);
void cluster_points_centroid(int n, int m, double* pts, double dist, int* idx);
""")

hclust = ffi.dlopen(cluster_fn)


def cluster_points_centroid(pts, dist):
  pts = np.ascontiguousarray(pts, dtype=np.float64)
  pts_ptr = ffi.cast("double *", pts.ctypes.data)
  n, m = pts.shape

  labels_ptr = ffi.new("int[]", n)
  # the C library works on squared distances, so square the threshold
  hclust.cluster_points_centroid(n, m, pts_ptr, dist**2, labels_ptr)
  return list(labels_ptr)
@@ -1,35 +0,0 @@
#include <cassert>

extern "C" {
#include "fastcluster.h"
}


int main(int argc, const char* argv[]) {
  const int n = 11;
  const int m = 3;
  double* pts = new double[n*m]{59.26000137, -9.35999966, -5.42500019,
                                91.61999817, -0.31999999, -2.75,
                                31.38000031, 0.40000001, -0.2,
                                89.57999725, -8.07999992, -18.04999924,
                                53.42000122, 0.63999999, -0.175,
                                31.38000031, 0.47999999, -0.2,
                                36.33999939, 0.16, -0.2,
                                53.33999939, 0.95999998, -0.175,
                                59.26000137, -9.76000023, -5.44999981,
                                33.93999977, 0.40000001, -0.22499999,
                                106.74000092, -5.76000023, -18.04999924};

  int * idx = new int[n];
  int * correct_idx = new int[n]{0, 1, 2, 3, 4, 2, 5, 4, 0, 5, 6};

  cluster_points_centroid(n, m, pts, 2.5 * 2.5, idx);

  for (int i = 0; i < n; i++) {
    assert(idx[i] == correct_idx[i]);
  }

  delete[] idx;
  delete[] correct_idx;
  delete[] pts;
}