diff --git a/poetry.lock b/poetry.lock index 7a6399cdb8..48f8c565c1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1217,66 +1217,6 @@ files = [ [package.extras] testing = ["hatch", "pre-commit", "pytest", "tox"] -[[package]] -name = "fastcluster" -version = "1.2.6" -description = "Fast hierarchical clustering routines for R and Python." -optional = false -python-versions = ">=3" -files = [ - {file = "fastcluster-1.2.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d0e8faef0437a25fd083df70fb86cc65ce3c9c9780d4ae377cbe6521e7746ce0"}, - {file = "fastcluster-1.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8be01f97bc2bf11a9188537864f8e520e1103cdc6007aa2c5d7979b1363b121"}, - {file = "fastcluster-1.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:855ab2b7e6fa9b05f19c4f3023dedfb1a35a88d831933d65d0d9e10a070a9e85"}, - {file = "fastcluster-1.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72503e727887a61a15f9aaa13178798d3994dfec58aa7a943e42dcfda07c0149"}, - {file = "fastcluster-1.2.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcb0973ca0e6978e3242046338c350cbed1493108929231fae9bd35ad05a6d6"}, - {file = "fastcluster-1.2.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9020899b67fe492d0ed87a3e993ec9962c5a0b51ea2df71d86b1766f065f1cef"}, - {file = "fastcluster-1.2.6-cp310-cp310-win32.whl", hash = "sha256:6cf156d4203708348522393c523c2e61c81f5a6a500e0411dcba2b064551ea2f"}, - {file = "fastcluster-1.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:1801c9daa9aa5bbbb0830efe8bd3034b4b7a417e4b8dd353683999be29797df2"}, - {file = "fastcluster-1.2.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ce70c743490f6778b463524d1767a9ecccd31c8bd2dbb5739bb2174168c15d39"}, - {file = "fastcluster-1.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ac1b84d4b28456a379a71451d13995eb3242143452ce9c861f8913360de842a3"}, - {file = "fastcluster-1.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:55b49f6033c45a28f93540847b495ed0f718b5c3f4fef446cf77e3726662e1d5"}, - {file = "fastcluster-1.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1c776a4ec7594f47cd2e1e2da73a30134f1d402d7c93a81e3cb7c3d8e191173"}, - {file = "fastcluster-1.2.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aca61d16435bb7aea3901939d7d7d7e36aff9bb538123e649166a3014b280054"}, - {file = "fastcluster-1.2.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04ea4a68e0675072ca761bad33322a0e998cb43693fd41165bc420d7db40429a"}, - {file = "fastcluster-1.2.6-cp311-cp311-win32.whl", hash = "sha256:773043d5db2790e1ff2a4e1eae0b6a60afb2a93ad2c74897a56c80bc800db04f"}, - {file = "fastcluster-1.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:841d128daa6597d13781793eb482b0b566bbd58d2a9d1e2cf1b58838773beb14"}, - {file = "fastcluster-1.2.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cf5acfe1156849279ebd44a8d1fbcbe8b8e21334f7538eda782ae31e7dade9e2"}, - {file = "fastcluster-1.2.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb27c13225f5f77f3c5986a27ca27277cce7db12844330cf535019cd38021257"}, - {file = "fastcluster-1.2.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fe543b6d45da27e84e5af6248722475b88943d8ef40d835cbabbb0ba5ee786b"}, - 
{file = "fastcluster-1.2.6-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c12224da0b1f2f9d2b3d715dc82ecb1a3a33b990606f97da075cc46bc6d9576f"}, - {file = "fastcluster-1.2.6-cp36-cp36m-win32.whl", hash = "sha256:86a1ad972e83ba48144884fa849f87626346308b650002157123aee67d3b16fe"}, - {file = "fastcluster-1.2.6-cp36-cp36m-win_amd64.whl", hash = "sha256:8d3c9eab8e69cb36dcdd64c8b3200e008aedf88e34d39e01ae6af98a9605ad18"}, - {file = "fastcluster-1.2.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c61be0bad81a21ee3e5bef91469fdd11968f33d41d142c656accba9b2992babe"}, - {file = "fastcluster-1.2.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06df1d97edca68b2ffa43d81c3b5f4e4147bc12ab241c6585fadcdeb0bfa23ca"}, - {file = "fastcluster-1.2.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ab9337b0a6a9b07b6f86fc724972d1ad729c890e2f539c1b33271c2f1f00af8b"}, - {file = "fastcluster-1.2.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4093d5454bcbe48b30e32da5db43056a08889480a96e4555f28c1f7004fc5323"}, - {file = "fastcluster-1.2.6-cp37-cp37m-win32.whl", hash = "sha256:58958a0333e3dfbad198394e9b7dd6254de0a3e622019b057288405b2a4a6bba"}, - {file = "fastcluster-1.2.6-cp37-cp37m-win_amd64.whl", hash = "sha256:03f8efe6435a7b947fa4a420676942d0267dac0d323ec5ead50f1856cc7cf96f"}, - {file = "fastcluster-1.2.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a5ceb39379327316d34613f7c16c06d7a3816aa38f4437b5e8433aa1bf149e2c"}, - {file = "fastcluster-1.2.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0bb54283b4d5ec96f167c7fd31921f169226c1261637434fdae7a69ee3c69573"}, - {file = "fastcluster-1.2.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6e51db0067e65687a5c46f00a11843d0bb15ca759e8a1767eebac8c4f6e3f4df"}, - {file = "fastcluster-1.2.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11748a4e245c745030e9ddd8c2c37e378f8ad8bd8e869d954c84ff674495499f"}, - {file = "fastcluster-1.2.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7254f81dc71cd29ef6f2d9747cf97ff907b569c9ef9d9760352391be5b57118c"}, - {file = "fastcluster-1.2.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa4a4c01c5fbec3623e92bc33a9f712ca416ce93255c402f5c904ac4b890ac3c"}, - {file = "fastcluster-1.2.6-cp38-cp38-win32.whl", hash = "sha256:ffdb00782cd63bbf2c45bb048897531e868326dff5081ab9b752d294b0426c1d"}, - {file = "fastcluster-1.2.6-cp38-cp38-win_amd64.whl", hash = "sha256:a952a84453123db0c2336b9a9c86162e99ad0b897bae8213107c055a64effd41"}, - {file = "fastcluster-1.2.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a085e7e13f1afc517358981b2b7ed774dc9abf95f2be0da9a495d9e6b58c4409"}, - {file = "fastcluster-1.2.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a7c7f51a6d2f5ab58b1d85e9d0af2af9600ec13bb43bc6aafc9085d2c4ccd93"}, - {file = "fastcluster-1.2.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8bac5cf64691060cf86b0752dd385ef1eccff6d24bdb8b60691cf8cbf0e4f9ef"}, - {file = "fastcluster-1.2.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:060c1cb3c84942d8d3618385e2c25998ba690c46ec8c73d64477f808abfac3f2"}, - {file = "fastcluster-1.2.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:e03a228e018457842eb81de85be7af0b5fe8065d666dd093193e3bdcf1f13d2e"}, - {file = "fastcluster-1.2.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6f8da329c0032f2acaf4beaef958a2db0dae43d3f946f592dad5c29aa82c832"}, - {file = "fastcluster-1.2.6-cp39-cp39-win32.whl", hash = "sha256:eb3f98791427d5d5d02d023b66bcef61e48954edfadae6527ef72d70cf32ec86"}, - {file = "fastcluster-1.2.6-cp39-cp39-win_amd64.whl", hash = "sha256:4b9cfd426966b8037bec2fc03a0d7a9c87313482c699b36ffa1432b49f84ed2e"}, - {file = "fastcluster-1.2.6.tar.gz", hash = "sha256:aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6"}, -] - -[package.dependencies] -numpy = ">=1.9" - -[package.extras] -test = ["scipy (>=1.6.3)"] - [[package]] name = "filelock" version = "3.12.2" @@ -5396,4 +5336,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "c88d289360ff8c163e0e9fa45431fd86bd783f8e51134711df3e7559b9e0303e" +content-hash = "f342fe9e841e08da2db9cf28aceaaa4112a67ea7015fc66c4aadace327868929" diff --git a/pyproject.toml b/pyproject.toml index 38451501f2..7da35094fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,6 @@ carla = { url = "https://github.com/commaai/carla/releases/download/3.11.4/carla control = "*" coverage = "*" dictdiffer = "*" -fastcluster = "*" ft4222 = "*" hexdump = "*" hypothesis = "==6.46.7" diff --git a/release/files_common b/release/files_common index 90ee2ccd6f..8381546b03 100644 --- a/release/files_common +++ b/release/files_common @@ -418,8 +418,6 @@ selfdrive/assets/navigation/* third_party/.gitignore third_party/SConscript -third_party/cluster/* - third_party/linux/** third_party/opencl/** diff --git a/selfdrive/controls/tests/test_clustering.py b/selfdrive/controls/tests/test_clustering.py deleted file mode 100644 index 99c24d5938..0000000000 --- a/selfdrive/controls/tests/test_clustering.py +++ /dev/null @@ -1,140 +0,0 @@ -# pylint: skip-file - -import time -import unittest -import numpy as np -from fastcluster import linkage_vector -from scipy.cluster import _hierarchy -from scipy.spatial.distance import pdist - -from third_party.cluster.fastcluster_py import hclust, ffi -from third_party.cluster.fastcluster_py import cluster_points_centroid - - -def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None): - # supersimplified function to get fast clustering. 
Got it from scipy - Z = np.asarray(Z, order='c') - n = Z.shape[0] + 1 - T = np.zeros((n,), dtype='i') - _hierarchy.cluster_dist(Z, T, float(t), int(n)) - return T - - -TRACK_PTS = np.array([[59.26000137, -9.35999966, -5.42500019], - [91.61999817, -0.31999999, -2.75], - [31.38000031, 0.40000001, -0.2], - [89.57999725, -8.07999992, -18.04999924], - [53.42000122, 0.63999999, -0.175], - [31.38000031, 0.47999999, -0.2], - [36.33999939, 0.16, -0.2], - [53.33999939, 0.95999998, -0.175], - [59.26000137, -9.76000023, -5.44999981], - [33.93999977, 0.40000001, -0.22499999], - [106.74000092, -5.76000023, -18.04999924]]) - -CORRECT_LINK = np.array([[2., 5., 0.07999998, 2.], - [4., 7., 0.32984889, 2.], - [0., 8., 0.40078104, 2.], - [6., 9., 2.41209933, 2.], - [11., 14., 3.76342275, 4.], - [12., 13., 13.02297651, 4.], - [1., 3., 17.27626057, 2.], - [10., 17., 17.92918845, 3.], - [15., 16., 23.68525366, 8.], - [18., 19., 52.52351319, 11.]]) - -CORRECT_LABELS = np.array([7, 1, 4, 2, 6, 4, 5, 6, 7, 5, 3], dtype=np.int32) - - -def plot_cluster(pts, idx_old, idx_new): - import matplotlib.pyplot as plt - m = 'Set1' - - plt.figure() - plt.subplot(1, 2, 1) - plt.scatter(pts[:, 0], pts[:, 1], c=idx_old, cmap=m) - plt.title("Old") - plt.colorbar() - plt.subplot(1, 2, 2) - plt.scatter(pts[:, 0], pts[:, 1], c=idx_new, cmap=m) - plt.title("New") - plt.colorbar() - - plt.show() - - -def same_clusters(correct, other): - correct = np.asarray(correct) - other = np.asarray(other) - if len(correct) != len(other): - return False - - for i in range(len(correct)): - c = np.where(correct == correct[i]) - o = np.where(other == other[i]) - if not np.array_equal(c, o): - return False - return True - - -class TestClustering(unittest.TestCase): - def test_scipy_clustering(self): - old_link = linkage_vector(TRACK_PTS, method='centroid') - old_cluster_idxs = fcluster(old_link, 2.5, criterion='distance') - - np.testing.assert_allclose(old_link, CORRECT_LINK) - np.testing.assert_allclose(old_cluster_idxs, CORRECT_LABELS) - - def test_pdist(self): - pts = np.ascontiguousarray(TRACK_PTS, dtype=np.float64) - pts_ptr = ffi.cast("double *", pts.ctypes.data) - - n, m = pts.shape - out = np.zeros((n * (n - 1) // 2, ), dtype=np.float64) - out_ptr = ffi.cast("double *", out.ctypes.data) - hclust.hclust_pdist(n, m, pts_ptr, out_ptr) - - np.testing.assert_allclose(out, np.power(pdist(TRACK_PTS), 2)) - - def test_cpp_clustering(self): - pts = np.ascontiguousarray(TRACK_PTS, dtype=np.float64) - pts_ptr = ffi.cast("double *", pts.ctypes.data) - n, m = pts.shape - - labels = np.zeros((n, ), dtype=np.int32) - labels_ptr = ffi.cast("int *", labels.ctypes.data) - hclust.cluster_points_centroid(n, m, pts_ptr, 2.5**2, labels_ptr) - self.assertTrue(same_clusters(CORRECT_LABELS, labels)) - - def test_cpp_wrapper_clustering(self): - labels = cluster_points_centroid(TRACK_PTS, 2.5) - self.assertTrue(same_clusters(CORRECT_LABELS, labels)) - - def test_random_cluster(self): - np.random.seed(1337) - N = 1000 - - t_old = 0. - t_new = 0. 
- - for _ in range(N): - n = int(np.random.uniform(2, 32)) - x = np.random.uniform(-10, 50, (n, 1)) - y = np.random.uniform(-5, 5, (n, 1)) - vrel = np.random.uniform(-5, 5, (n, 1)) - pts = np.hstack([x, y, vrel]) - - t = time.time() - old_link = linkage_vector(pts, method='centroid') - old_cluster_idx = fcluster(old_link, 2.5, criterion='distance') - t_old += time.time() - t - - t = time.time() - cluster_idx = cluster_points_centroid(pts, 2.5) - t_new += time.time() - t - - self.assertTrue(same_clusters(old_cluster_idx, cluster_idx)) - - -if __name__ == "__main__": - unittest.main() diff --git a/third_party/SConscript b/third_party/SConscript index e8d1789ee0..e5bbfaa07a 100644 --- a/third_party/SConscript +++ b/third_party/SConscript @@ -4,5 +4,3 @@ env.Library('json11', ['json11/json11.cpp'], CCFLAGS=env['CCFLAGS'] + ['-Wno-unq env.Append(CPPPATH=[Dir('json11')]) env.Library('kaitai', ['kaitai/kaitaistream.cpp'], CPPDEFINES=['KS_STR_ENCODING_NONE']) - -SConscript(['cluster/SConscript']) diff --git a/third_party/cluster/.gitignore b/third_party/cluster/.gitignore deleted file mode 100644 index 9daeafb986..0000000000 --- a/third_party/cluster/.gitignore +++ /dev/null @@ -1 +0,0 @@ -test diff --git a/third_party/cluster/LICENSE b/third_party/cluster/LICENSE deleted file mode 100644 index ab8b4db7d7..0000000000 --- a/third_party/cluster/LICENSE +++ /dev/null @@ -1,13 +0,0 @@ -Copyright: - * fastcluster_dm.cpp & fastcluster_R_dm.cpp: - © 2011 Daniel Müllner - * fastcluster.(h|cpp) & demo.cpp & plotresult.r: - © 2018 Christoph Dalitz -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/cluster/README b/third_party/cluster/README deleted file mode 100644 index acb18bc72b..0000000000 --- a/third_party/cluster/README +++ /dev/null @@ -1,79 +0,0 @@ -C++ interface to fast hierarchical clustering algorithms -======================================================== - -This is a simplified C++ interface to fast implementations of hierarchical -clustering by Daniel Müllner. The original library with interfaces to R -and Python is described in: - -Daniel Müllner: "fastcluster: Fast Hierarchical, Agglomerative Clustering -Routines for R and Python." Journal of Statistical Software 53 (2013), -no. 9, pp. 
1–18, http://www.jstatsoft.org/v53/i09/ - - -Usage of the library --------------------- - -For using the library, the following source files are needed: - -fastcluster_dm.cpp, fastcluster_R_dm.cpp - original code by Daniel Müllner - these are included by fastcluster.cpp via #include, and therefore - need not be compiled to object code - -fastcluster.[h|cpp] - simplified C++ interface - fastcluster.cpp is the only file that must be compiled - -The library provides the clustering function *hclust_fast* for -creating the dendrogram information in an encoding as used by the -R function *hclust*. For a description of the parameters, see fastcluster.h. -Its parameter *method* can be one of - -HCLUST_METHOD_SINGLE - single link with the minimum spanning tree algorithm (Rohlf, 1973) - -HCLUST_METHOD_COMPLETE - complete link with the nearest-neighbor-chain algorithm (Murtagh, 1984) - -HCLUST_METHOD_AVERAGE - average link with the nearest-neighbor-chain algorithm (Murtagh, 1984) - -HCLUST_METHOD_MEDIAN - median link with the generic algorithm (Müllner, 2011) - -For splitting the dendrogram into clusters, the two functions *cutree_k* -and *cutree_cdist* are provided. - -Note that output parameters must be allocated beforehand, e.g. - int* merge = new int[2*(npoints-1)]; -For a complete usage example, see lines 135-142 of demo.cpp. - - -Demonstration program ---------------------- - -A simple demo is implemented in demo.cpp, which can be compiled and run with - - make - ./hclust-demo -m complete lines.csv - -It creates two clusters of line segments such that the segment angle between -line segments of different clusters has a maximum (cosine) dissimilarity. -For visualizing the result, plotresult.r can be used as follows -(requires R to be installed): - - ./hclust-demo -m complete lines.csv | Rscript plotresult.r - - -Authors & Copyright ------------------- - -Daniel Müllner, 2011, -Christoph Dalitz, 2018, - - -License ------- - -This code is provided under a BSD-style license. -See the file LICENSE for details. diff --git a/third_party/cluster/SConscript b/third_party/cluster/SConscript deleted file mode 100644 index 97eb4300d4..0000000000 --- a/third_party/cluster/SConscript +++ /dev/null @@ -1,8 +0,0 @@ -Import('env') - -fc = env.SharedLibrary("fastcluster", "fastcluster.cpp") - -# TODO: how do I gate on test -#env.Program("test", ["test.cpp"], LIBS=[fc]) -#valgrind --leak-check=full ./test - diff --git a/third_party/cluster/__init__.py b/third_party/cluster/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/third_party/cluster/fastcluster.cpp b/third_party/cluster/fastcluster.cpp deleted file mode 100644 index d2b557d6f5..0000000000 --- a/third_party/cluster/fastcluster.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// -// C++ standalone version of fastcluster by Daniel Müllner -// -// Copyright: Christoph Dalitz, 2018 -// Daniel Müllner, 2011 -// License: BSD style license -// (see the file LICENSE for details) -// - - -#include <vector> -#include <algorithm> -#include <cmath> - - -extern "C" { -#include "fastcluster.h" -} - -// Code by Daniel Müllner -// workaround to make it usable as a standalone version (without R) -bool fc_isnan(double x) { return false; } -#include "fastcluster_dm.cpp" -#include "fastcluster_R_dm.cpp" - -extern "C" { -// -// Assigns cluster labels (0, ..., nclust-1) to the n points such -// that the cluster result is split into nclust clusters.
-// -// Input arguments: -// n = number of observables -// merge = clustering result in R format -// nclust = number of clusters -// Output arguments: -// labels = allocated integer array of size n for result -// - void cutree_k(int n, const int* merge, int nclust, int* labels) { - - int k,m1,m2,j,l; - - if (nclust > n || nclust < 2) { - for (j=0; j<n; j++) labels[j] = 0; - return; - } - - // assign to each observable the number of its last merge step - // beware: indices of observables in merge start at 1 (R convention) - std::vector<int> last_merge(n, 0); - for (k=1; k<=(n-nclust); k++) { - // (m1,m2) = merge[k,] - m1 = merge[k-1]; - m2 = merge[n-1+k-1]; - if (m1 < 0 && m2 < 0) { // both single observables - last_merge[-m1-1] = last_merge[-m2-1] = k; - } - else if (m1 < 0 || m2 < 0) { // one is a cluster - if(m1 < 0) { j = -m1; m1 = m2; } else j = -m2; - // merging single observable and cluster - for(l = 0; l < n; l++) - if (last_merge[l] == m1) - last_merge[l] = k; - last_merge[j-1] = k; - } - else { // both cluster - for(l=0; l < n; l++) { - if( last_merge[l] == m1 || last_merge[l] == m2 ) - last_merge[l] = k; - } - } - } - - // assign cluster labels - int label = 0; - std::vector<int> z(n,-1); - for (j=0; j<n; j++) { - if (last_merge[j] == 0) { // still singleton - labels[j] = label++; - } - else { - if (z[last_merge[j]] < 0) { - z[last_merge[j]] = label++; - } - labels[j] = z[last_merge[j]]; - } - } - } - - // - // Assigns cluster labels (0, ..., nclust-1) to the n points such - // that the hierarchical clustering is stopped when cluster distance >= cdist - // - // Input arguments: - // n = number of observables - // merge = clustering result in R format - // height = cluster distance at each merge step - // cdist = cutoff cluster distance - // Output arguments: - // labels = allocated integer array of size n for result - // - void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels) { - - int k; - - for (k=0; k<(n-1); k++) { - if (height[k] >= cdist) { - break; - } - } - cutree_k(n, merge, n-k, labels); - } - - - // - // Hierarchical clustering with one of Daniel Muellner's fast algorithms - // - // Input arguments: - // n = number of observables - // distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing - // the upper triangle (without diagonal elements) of the distance - // matrix, e.g. for n=4: - // d00 d01 d02 d03 - // d10 d11 d12 d13 -> d01 d02 d03 d12 d13 d23 - // d20 d21 d22 d23 - // d30 d31 d32 d33 - // method = cluster metric (see enum method_code) - // Output arguments: - // merge = allocated (n-1)x2 matrix (2*(n-1) array) for storing result.
- // Result follows R hclust convention: - // - observable indices start with one - // - merge[i][] contains the merged nodes in step i - // - merge[i][j] is negative when the node is an atom - // height = allocated (n-1) array with distances at each merge step - // Return code: - // 0 = ok - // 1 = invalid method - // - int hclust_fast(int n, double* distmat, int method, int* merge, double* height) { - - // call appropriate clustering function - cluster_result Z2(n-1); - if (method == HCLUST_METHOD_SINGLE) { - // single link - MST_linkage_core(n, distmat, Z2); - } - else if (method == HCLUST_METHOD_COMPLETE) { - // complete link - NN_chain_core<METHOD_METR_COMPLETE, t_float>(n, distmat, NULL, Z2); - } - else if (method == HCLUST_METHOD_AVERAGE) { - // best average distance - double* members = new double[n]; - for (int i=0; i<n; i++) members[i] = 1; - NN_chain_core<METHOD_METR_AVERAGE, t_float>(n, distmat, members, Z2); - delete[] members; - } - else if (method == HCLUST_METHOD_MEDIAN) { - // best median distance (beware: O(n^3)) - generic_linkage<METHOD_METR_MEDIAN, t_float>(n, distmat, NULL, Z2); - } - else if (method == HCLUST_METHOD_CENTROID) { - // best centroid distance (beware: O(n^3)) - double* members = new double[n]; - for (int i=0; i<n; i++) members[i] = 1; - generic_linkage<METHOD_METR_CENTROID, t_float>(n, distmat, members, Z2); - delete[] members; - } - else { - return 1; - } - - int* order = new int[n]; - if (method == HCLUST_METHOD_MEDIAN || method == HCLUST_METHOD_CENTROID) { - generate_R_dendrogram<true>(merge, height, order, Z2, n); - } else { - generate_R_dendrogram<false>(merge, height, order, Z2, n); - } - delete[] order; // only needed for visualization - - return 0; - } - - - // Build condensed distance matrix - // Input arguments: - // n = number of observables - // m = dimension of observable - // Output arguments: - // out = allocated double array of size n * (n - 1) / 2 for result - void hclust_pdist(int n, int m, double* pts, double* out) { - int ii = 0; - for (int i = 0; i < n; i++) { - for (int j = i + 1; j < n; j++) { - // Compute euclidean distance - double d = 0; - for (int k = 0; k < m; k ++) { - double error = pts[i * m + k] - pts[j * m + k]; - d += (error * error); - } - out[ii] = d;//sqrt(d); - ii++; - } - } - } - - void cluster_points_centroid(int n, int m, double* pts, double dist, int* idx) { - double* pdist = new double[n * (n - 1) / 2]; - int* merge = new int[2 * (n - 1)]; - double* height = new double[n - 1]; - - hclust_pdist(n, m, pts, pdist); - hclust_fast(n, pdist, HCLUST_METHOD_CENTROID, merge, height); - cutree_cdist(n, merge, height, dist, idx); - - delete[] pdist; - delete[] merge; - delete[] height; - } -} diff --git a/third_party/cluster/fastcluster.h b/third_party/cluster/fastcluster.h deleted file mode 100644 index 56c63b6a5e..0000000000 --- a/third_party/cluster/fastcluster.h +++ /dev/null @@ -1,77 +0,0 @@ -// -// C++ standalone version of fastcluster by Daniel Muellner -// -// Copyright: Daniel Muellner, 2011 -// Christoph Dalitz, 2018 -// License: BSD style license -// (see the file LICENSE for details) -// - -#ifndef fastclustercpp_H -#define fastclustercpp_H - -// -// Assigns cluster labels (0, ..., nclust-1) to the n points such -// that the cluster result is split into nclust clusters.
-// -// Input arguments: -// n = number of observables -// merge = clustering result in R format -// nclust = number of clusters -// Output arguments: -// labels = allocated integer array of size n for result -// -void cutree_k(int n, const int* merge, int nclust, int* labels); - -// -// Assigns cluster labels (0, ..., nclust-1) to the n points such -// that the hierarchical clustering is stopped at cluster distance cdist -// -// Input arguments: -// n = number of observables -// merge = clustering result in R format -// height = cluster distance at each merge step -// cdist = cutoff cluster distance -// Output arguments: -// labels = allocated integer array of size n for result -// -void cutree_cdist(int n, const int* merge, double* height, double cdist, int* labels); - -// -// Hierarchical clustering with one of Daniel Muellner's fast algorithms -// -// Input arguments: -// n = number of observables -// distmat = condensed distance matrix, i.e. an n*(n-1)/2 array representing -// the upper triangle (without diagonal elements) of the distance -// matrix, e.g. for n=4: -// d00 d01 d02 d03 -// d10 d11 d12 d13 -> d01 d02 d03 d12 d13 d23 -// d20 d21 d22 d23 -// d30 d31 d32 d33 -// method = cluster metric (see enum method_code) -// Output arguments: -// merge = allocated (n-1)x2 matrix (2*(n-1) array) for storing result. -// Result follows R hclust convention: -// - observable indices start with one -// - merge[i][] contains the merged nodes in step i -// - merge[i][j] is negative when the node is an atom -// height = allocated (n-1) array with distances at each merge step -// Return code: -// 0 = ok -// 1 = invalid method -// -int hclust_fast(int n, double* distmat, int method, int* merge, double* height); -enum hclust_fast_methods { - HCLUST_METHOD_SINGLE = 0, - HCLUST_METHOD_COMPLETE = 1, - HCLUST_METHOD_AVERAGE = 2, - HCLUST_METHOD_MEDIAN = 3, - HCLUST_METHOD_CENTROID = 5, -}; - -void hclust_pdist(int n, int m, double* pts, double* out); -void cluster_points_centroid(int n, int m, double* pts, double dist, int* idx); - - -#endif diff --git a/third_party/cluster/fastcluster_R_dm.cpp b/third_party/cluster/fastcluster_R_dm.cpp deleted file mode 100644 index cbe126c15e..0000000000 --- a/third_party/cluster/fastcluster_R_dm.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// -// Excerpt from fastcluster_R.cpp -// -// Copyright: Daniel Müllner, 2011 -// - -struct pos_node { - t_index pos; - int node; -}; - -void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) { - /* Parameters: - N : number of data points - merge : (N-1)×2 array which specifies the node indices which are - merged in each step of the clustering procedure. - Negative entries -1...-N point to singleton nodes, while - positive entries 1...(N-1) point to nodes which are themselves - parents of other nodes. - node_size : array of node sizes - makes it easier - order : output array of size N - - Runtime: Θ(N) - */ - auto_array_ptr<pos_node> queue(N/2); - - int parent; - int child; - t_index pos = 0; - - queue[0].pos = 0; - queue[0].node = N-2; - t_index idx = 1; - - do { - --idx; - pos = queue[idx].pos; - parent = queue[idx].node; - - // First child - child = merge[parent]; - if (child<0) { // singleton node, write this into the 'order' array. - order[pos] = -child; - ++pos; - } - else { /* compound node: put it on top of the queue and decompose it - in a later iteration.
*/ - queue[idx].pos = pos; - queue[idx].node = child-1; // convert index-1 based to index-0 based - ++idx; - pos += node_size[child-1]; - } - // Second child - child = merge[parent+N-1]; - if (child<0) { - order[pos] = -child; - } - else { - queue[idx].pos = pos; - queue[idx].node = child-1; - ++idx; - } - } while (idx>0); -} - -#define size_(r_) ( ((r_ -void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) { - // The array "nodes" is a union-find data structure for the cluster - // identites (only needed for unsorted cluster_result input). - union_find nodes(sorted ? 0 : N); - if (!sorted) { - std::stable_sort(Z2[0], Z2[N-1]); - } - - t_index node1, node2; - auto_array_ptr node_size(N-1); - - for (t_index i=0; inode1; - node2 = Z2[i]->node2; - } - else { - node1 = nodes.Find(Z2[i]->node1); - node2 = nodes.Find(Z2[i]->node2); - // Merge the nodes in the union-find data structure by making them - // children of a new node. - nodes.Union(node1, node2); - } - // Sort the nodes in the output array. - if (node1>node2) { - t_index tmp = node1; - node1 = node2; - node2 = tmp; - } - /* Conversion between labeling conventions. - Input: singleton nodes 0,...,N-1 - compound nodes N,...,2N-2 - Output: singleton nodes -1,...,-N - compound nodes 1,...,N - */ - merge[i] = (node1(node1)-1 - : static_cast(node1)-N+1; - merge[i+N-1] = (node2(node2)-1 - : static_cast(node2)-N+1; - height[i] = Z2[i]->dist; - node_size[i] = size_(node1) + size_(node2); - } - - order_nodes(N, merge, node_size, order); -} diff --git a/third_party/cluster/fastcluster_dm.cpp b/third_party/cluster/fastcluster_dm.cpp deleted file mode 100644 index ee6670c204..0000000000 --- a/third_party/cluster/fastcluster_dm.cpp +++ /dev/null @@ -1,1794 +0,0 @@ -/* - fastcluster: Fast hierarchical clustering routines for R and Python - - Copyright © 2011 Daniel Müllner - - - This library implements various fast algorithms for hierarchical, - agglomerative clustering methods: - - (1) Algorithms for the "stored matrix approach": the input is the array of - pairwise dissimilarities. - - MST_linkage_core: single linkage clustering with the "minimum spanning - tree algorithm (Rohlfs) - - NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, - complete, average, weighted and Ward linkage (Murtagh) - - generic_linkage: generic algorithm, suitable for all distance update - formulas (Müllner) - - (2) Algorithms for the "stored data approach": the input are points in a - vector space. - - MST_linkage_core_vector: single linkage clustering for vector data - - generic_linkage_vector: generic algorithm for vector data, suitable for - the Ward, centroid and median methods. - - generic_linkage_vector_alternative: alternative scheme for updating the - nearest neighbors. This method seems faster than "generic_linkage_vector" - for the centroid and median methods but slower for the Ward method. - - All these implementation treat infinity values correctly. They throw an - exception if a NaN distance value occurs. -*/ - -// Older versions of Microsoft Visual Studio do not have the fenv header. -#ifdef _MSC_VER -#if (_MSC_VER == 1500 || _MSC_VER == 1600) -#define NO_INCLUDE_FENV -#endif -#endif -// NaN detection via fenv might not work on systems with software -// floating-point emulation (bug report for Debian armel). -#ifdef __SOFTFP__ -#define NO_INCLUDE_FENV -#endif -#ifdef NO_INCLUDE_FENV -#pragma message("Do not use fenv header.") -#else -//#pragma message("Use fenv header. 
If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.") -//#pragma STDC FENV_ACCESS on -#include -#endif - -#include // for std::pow, std::sqrt -#include // for std::ptrdiff_t -#include // for std::numeric_limits<...>::infinity() -#include // for std::fill_n -#include // for std::runtime_error -#include // for std::string - -#include // also for DBL_MAX, DBL_MIN -#ifndef DBL_MANT_DIG -#error The constant DBL_MANT_DIG could not be defined. -#endif -#define T_FLOAT_MANT_DIG DBL_MANT_DIG - -#ifndef LONG_MAX -#include -#endif -#ifndef LONG_MAX -#error The constant LONG_MAX could not be defined. -#endif -#ifndef INT_MAX -#error The constant INT_MAX could not be defined. -#endif - -#ifndef INT32_MAX -#ifdef _MSC_VER -#if _MSC_VER >= 1600 -#define __STDC_LIMIT_MACROS -#include -#else -typedef __int32 int_fast32_t; -typedef __int64 int64_t; -#endif -#else -#define __STDC_LIMIT_MACROS -#include -#endif -#endif - -#define FILL_N std::fill_n -#ifdef _MSC_VER -#if _MSC_VER < 1600 -#undef FILL_N -#define FILL_N stdext::unchecked_fill_n -#endif -#endif - -// Suppress warnings about (potentially) uninitialized variables. -#ifdef _MSC_VER - #pragma warning (disable:4700) -#endif - -#ifndef HAVE_DIAGNOSTIC -#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) -#define HAVE_DIAGNOSTIC 1 -#endif -#endif - -#ifndef HAVE_VISIBILITY -#if __GNUC__ >= 4 -#define HAVE_VISIBILITY 1 -#endif -#endif - -/* Since the public interface is given by the Python respectively R interface, - * we do not want other symbols than the interface initalization routines to be - * visible in the shared object file. The "visibility" switch is a GCC concept. - * Hiding symbols keeps the relocation table small and decreases startup time. - * See http://gcc.gnu.org/wiki/Visibility - */ -#if HAVE_VISIBILITY -#pragma GCC visibility push(hidden) -#endif - -typedef int_fast32_t t_index; -#ifndef INT32_MAX -#define MAX_INDEX 0x7fffffffL -#else -#define MAX_INDEX INT32_MAX -#endif -#if (LONG_MAX < MAX_INDEX) -#error The integer format "t_index" must not have a greater range than "long int". -#endif -#if (INT_MAX > MAX_INDEX) -#error The integer format "int" must not have a greater range than "t_index". -#endif -typedef double t_float; - -/* Method codes. - - These codes must agree with the METHODS array in fastcluster.R and the - dictionary mthidx in fastcluster.py. 
-*/ -enum method_codes { - // non-Euclidean methods - METHOD_METR_SINGLE = 0, - METHOD_METR_COMPLETE = 1, - METHOD_METR_AVERAGE = 2, - METHOD_METR_WEIGHTED = 3, - METHOD_METR_WARD = 4, - METHOD_METR_WARD_D = METHOD_METR_WARD, - METHOD_METR_CENTROID = 5, - METHOD_METR_MEDIAN = 6, - METHOD_METR_WARD_D2 = 7, - - MIN_METHOD_CODE = 0, - MAX_METHOD_CODE = 7 -}; - -enum method_codes_vector { - // Euclidean methods - METHOD_VECTOR_SINGLE = 0, - METHOD_VECTOR_WARD = 1, - METHOD_VECTOR_CENTROID = 2, - METHOD_VECTOR_MEDIAN = 3, - - MIN_METHOD_VECTOR_CODE = 0, - MAX_METHOD_VECTOR_CODE = 3 -}; - -// self-destructing array pointer -template -class auto_array_ptr{ -private: - type * ptr; - auto_array_ptr(auto_array_ptr const &); // non construction-copyable - auto_array_ptr& operator=(auto_array_ptr const &); // non copyable -public: - auto_array_ptr() - : ptr(NULL) - { } - template - auto_array_ptr(index const size) - : ptr(new type[size]) - { } - template - auto_array_ptr(index const size, value const val) - : ptr(new type[size]) - { - FILL_N(ptr, size, val); - } - ~auto_array_ptr() { - delete [] ptr; } - void free() { - delete [] ptr; - ptr = NULL; - } - template - void init(index const size) { - ptr = new type [size]; - } - template - void init(index const size, value const val) { - init(size); - FILL_N(ptr, size, val); - } - inline operator type *() const { return ptr; } -}; - -struct node { - t_index node1, node2; - t_float dist; -}; - -inline bool operator< (const node a, const node b) { - return (a.dist < b.dist); -} - -class cluster_result { -private: - auto_array_ptr Z; - t_index pos; - -public: - cluster_result(const t_index size) - : Z(size) - , pos(0) - {} - - void append(const t_index node1, const t_index node2, const t_float dist) { - Z[pos].node1 = node1; - Z[pos].node2 = node2; - Z[pos].dist = dist; - ++pos; - } - - node * operator[] (const t_index idx) const { return Z + idx; } - - /* Define several methods to postprocess the distances. All these functions - are monotone, so they do not change the sorted order of distances. */ - - void sqrt() const { - for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { - ZZ->dist = std::sqrt(ZZ->dist); - } - } - - void sqrt(const t_float) const { // ignore the argument - sqrt(); - } - - void sqrtdouble(const t_float) const { // ignore the argument - for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { - ZZ->dist = std::sqrt(2*ZZ->dist); - } - } - - #ifdef R_pow - #define my_pow R_pow - #else - #define my_pow std::pow - #endif - - void power(const t_float p) const { - t_float const q = 1/p; - for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { - ZZ->dist = my_pow(ZZ->dist,q); - } - } - - void plusone(const t_float) const { // ignore the argument - for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { - ZZ->dist += 1; - } - } - - void divide(const t_float denom) const { - for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { - ZZ->dist /= denom; - } - } -}; - -class doubly_linked_list { - /* - Class for a doubly linked list. Initially, the list is the integer range - [0, size]. We provide a forward iterator and a method to delete an index - from the list. - - Typical use: for (i=L.start; L succ; - -private: - auto_array_ptr pred; - // Not necessarily private, we just do not need it in this instance. - -public: - doubly_linked_list(const t_index size) - // Initialize to the given size. - : start(0) - , succ(size+1) - , pred(size+1) - { - for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) -// Z is an ((N-1)x4)-array -#define Z_(_r, _c) (Z[(_r)*4 + (_c)]) - -/* - Lookup function for a union-find data structure. 
- - The function finds the root of idx by going iteratively through all - parent elements until a root is found. An element i is a root if - nodes[i] is zero. To make subsequent searches faster, the entry for - idx and all its parents is updated with the root element. - */ -class union_find { -private: - auto_array_ptr parent; - t_index nextparent; - -public: - union_find(const t_index size) - : parent(size>0 ? 2*size-1 : 0, 0) - , nextparent(size) - { } - - t_index Find (t_index idx) const { - if (parent[idx] != 0 ) { // a → b - t_index p = idx; - idx = parent[idx]; - if (parent[idx] != 0 ) { // a → b → c - do { - idx = parent[idx]; - } while (parent[idx] != 0); - do { - t_index tmp = parent[p]; - parent[p] = idx; - p = tmp; - } while (parent[p] != idx); - } - } - return idx; - } - - void Union (const t_index node1, const t_index node2) { - parent[node1] = parent[node2] = nextparent++; - } -}; - -class nan_error{}; -#ifdef FE_INVALID -class fenv_error{}; -#endif - -static void MST_linkage_core(const t_index N, const t_float * const D, - cluster_result & Z2) { -/* - N: integer, number of data points - D: condensed distance matrix N*(N-1)/2 - Z2: output data structure - - The basis of this algorithm is an algorithm by Rohlf: - - F. James Rohlf, Hierarchical clustering using the minimum spanning tree, - The Computer Journal, vol. 16, 1973, p. 93–95. -*/ - t_index i; - t_index idx2; - doubly_linked_list active_nodes(N); - auto_array_ptr d(N); - - t_index prev_node; - t_float min; - - // first iteration - idx2 = 1; - min = std::numeric_limits::infinity(); - for (i=1; i tmp) - d[i] = tmp; - else if (fc_isnan(tmp)) - throw (nan_error()); -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - if (d[i] < min) { - min = d[i]; - idx2 = i; - } - } - Z2.append(prev_node, idx2, min); - } -} - -/* Functions for the update of the dissimilarity array */ - -inline static void f_single( t_float * const b, const t_float a ) { - if (*b > a) *b = a; -} -inline static void f_complete( t_float * const b, const t_float a ) { - if (*b < a) *b = a; -} -inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { - *b = s*a + t*(*b); - #ifndef FE_INVALID -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - if (fc_isnan(*b)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - #endif -} -inline static void f_weighted( t_float * const b, const t_float a) { - *b = (a+*b)*.5; - #ifndef FE_INVALID -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - if (fc_isnan(*b)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - #endif -} -inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { - *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); - //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); - #ifndef FE_INVALID -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - if (fc_isnan(*b)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - #endif -} -inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { - *b = s*a - stc + t*(*b); - #ifndef FE_INVALID - if (fc_isnan(*b)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - #endif -} -inline static void f_median( 
t_float * const b, const t_float a, const t_float c_4) { - *b = (a+(*b))*.5 - c_4; - #ifndef FE_INVALID -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - if (fc_isnan(*b)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - #endif -} - -template -static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { -/* - N: integer - D: condensed distance matrix N*(N-1)/2 - Z2: output data structure - - This is the NN-chain algorithm, described on page 86 in the following book: - - Fionn Murtagh, Multidimensional Clustering Algorithms, - Vienna, Würzburg: Physica-Verlag, 1985. -*/ - t_index i; - - auto_array_ptr NN_chain(N); - t_index NN_chain_tip = 0; - - t_index idx1, idx2; - - t_float size1, size2; - doubly_linked_list active_nodes(N); - - t_float min; - - for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); - ++DD) { -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - if (fc_isnan(*DD)) { - throw(nan_error()); - } -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - } - - #ifdef FE_INVALID - if (feclearexcept(FE_INVALID)) throw fenv_error(); - #endif - - for (t_index j=0; jidx2) { - t_index tmp = idx1; - idx1 = idx2; - idx2 = tmp; - } - - if (method==METHOD_METR_AVERAGE || - method==METHOD_METR_WARD) { - size1 = static_cast(members[idx1]); - size2 = static_cast(members[idx2]); - members[idx2] += members[idx1]; - } - - // Remove the smaller index from the valid indices (active_nodes). - active_nodes.remove(idx1); - - switch (method) { - case METHOD_METR_SINGLE: - /* - Single linkage. - - Characteristic: new distances are never longer than the old distances. - */ - // Update the distance matrix in the range [start, idx1). - for (i=active_nodes.start; i(members[i]); - for (i=active_nodes.start; i(members[i]) ); - // Update the distance matrix in the range (idx1, idx2). - for (; i(members[i]) ); - // Update the distance matrix in the range (idx2, N). - for (i=active_nodes.succ[idx2]; i(members[i]) ); - break; - - default: - throw std::runtime_error(std::string("Invalid method.")); - } - } - #ifdef FE_INVALID - if (fetestexcept(FE_INVALID)) throw fenv_error(); - #endif -} - -class binary_min_heap { - /* - Class for a binary min-heap. The data resides in an array A. The elements of - A are not changed but two lists I and R of indices are generated which point - to elements of A and backwards. - - The heap tree structure is - - H[2*i+1] H[2*i+2] - \ / - \ / - ≤ ≤ - \ / - \ / - H[i] - - where the children must be less or equal than their parent. Thus, H[0] - contains the minimum. The lists I and R are made such that H[i] = A[I[i]] - and R[I[i]] = i. - - This implementation is not designed to handle NaN values. - */ -private: - t_float * const A; - t_index size; - auto_array_ptr I; - auto_array_ptr R; - - // no default constructor - binary_min_heap(); - // noncopyable - binary_min_heap(binary_min_heap const &); - binary_min_heap & operator=(binary_min_heap const &); - -public: - binary_min_heap(t_float * const A_, const t_index size_) - : A(A_), size(size_), I(size), R(size) - { // Allocate memory and initialize the lists I and R to the identity. This - // does not make it a heap. Call heapify afterwards! - for (t_index i=0; i>1); idx>0; ) { - --idx; - update_geq_(idx); - } - } - - inline t_index argmin() const { - // Return the minimal element. 
- return I[0]; - } - - void heap_pop() { - // Remove the minimal element from the heap. - --size; - I[0] = I[size]; - R[I[0]] = 0; - update_geq_(0); - } - - void remove(t_index idx) { - // Remove an element from the heap. - --size; - R[I[size]] = R[idx]; - I[R[idx]] = I[size]; - if ( H(size)<=A[idx] ) { - update_leq_(R[idx]); - } - else { - update_geq_(R[idx]); - } - } - - void replace ( const t_index idxold, const t_index idxnew, - const t_float val) { - R[idxnew] = R[idxold]; - I[R[idxnew]] = idxnew; - if (val<=A[idxold]) - update_leq(idxnew, val); - else - update_geq(idxnew, val); - } - - void update ( const t_index idx, const t_float val ) const { - // Update the element A[i] with val and re-arrange the indices to preserve - // the heap condition. - if (val<=A[idx]) - update_leq(idx, val); - else - update_geq(idx, val); - } - - void update_leq ( const t_index idx, const t_float val ) const { - // Use this when the new value is not more than the old value. - A[idx] = val; - update_leq_(R[idx]); - } - - void update_geq ( const t_index idx, const t_float val ) const { - // Use this when the new value is not less than the old value. - A[idx] = val; - update_geq_(R[idx]); - } - -private: - void update_leq_ (t_index i) const { - t_index j; - for ( ; (i>0) && ( H(i)>1) ); i=j) - heap_swap(i,j); - } - - void update_geq_ (t_index i) const { - t_index j; - for ( ; (j=2*i+1)=H(i) ) { - ++j; - if ( j>=size || H(j)>=H(i) ) break; - } - else if ( j+1 -static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { - /* - N: integer, number of data points - D: condensed distance matrix N*(N-1)/2 - Z2: output data structure - */ - - const t_index N_1 = N-1; - t_index i, j; // loop variables - t_index idx1, idx2; // row and column indices - - auto_array_ptr n_nghbr(N_1); // array of nearest neighbors - auto_array_ptr mindist(N_1); // distances to the nearest neighbors - auto_array_ptr row_repr(N); // row_repr[i]: node number that the - // i-th row represents - doubly_linked_list active_nodes(N); - binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for - // the distance to the nearest neighbor of each point - t_index node1, node2; // node numbers in the output - t_float size1, size2; // and their cardinalities - - t_float min; // minimum and row index for nearest-neighbor search - t_index idx; - - for (i=0; ii} D(i,j) for i in range(N-1) - t_float const * DD = D; - for (i=0; i::infinity(); - for (idx=j=i+1; ji} D(i,j) - - Normally, we have equality. However, this minimum may become invalid due - to the updates in the distance matrix. The rules are: - - 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct - minimum and n_nghbr[i] is a nearest neighbor. - - 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the - correct minimum. The minimum needs to be recomputed. - - 3) mindist[i] is never bigger than the true minimum. Hence, we never - miss the true minimum if we take the smallest mindist entry, - re-compute the value if necessary (thus maybe increasing it) and - looking for the now smallest mindist entry until a valid minimal - entry is found. This step is done in the lines below. - - The update process for D below takes care that these rules are - fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are - re-calculated when necessary but re-calculation is avoided whenever - possible. - - The re-calculation of the minima makes the worst-case runtime of this - algorithm cubic in N. 
We avoid this whenever possible, and in most cases - the runtime appears to be quadratic. - */ - idx1 = nn_distances.argmin(); - if (method != METHOD_METR_SINGLE) { - while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { - // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. - n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 - min = D_(idx1,j); - for (j=active_nodes.succ[j]; j(members[idx1]); - size2 = static_cast(members[idx2]); - members[idx2] += members[idx1]; - } - Z2.append(node1, node2, mindist[idx1]); - - // Remove idx1 from the list of active indices (active_nodes). - active_nodes.remove(idx1); - // Index idx2 now represents the new (merged) node with label N+i. - row_repr[idx2] = N+i; - - // Update the distance matrix - switch (method) { - case METHOD_METR_SINGLE: - /* - Single linkage. - - Characteristic: new distances are never longer than the old distances. - */ - // Update the distance matrix in the range [start, idx1). - for (j=active_nodes.start; j(members[j]) ); - if (n_nghbr[j] == idx1) - n_nghbr[j] = idx2; - } - // Update the distance matrix in the range (idx1, idx2). - for (; j(members[j]) ); - if (D_(j, idx2) < mindist[j]) { - nn_distances.update_leq(j, D_(j, idx2)); - n_nghbr[j] = idx2; - } - } - // Update the distance matrix in the range (idx2, N). - if (idx2(members[j]) ); - min = D_(idx2,j); - for (j=active_nodes.succ[j]; j(members[j]) ); - if (D_(idx2,j) < min) { - min = D_(idx2,j); - n_nghbr[idx2] = j; - } - } - nn_distances.update(idx2, min); - } - break; - - case METHOD_METR_CENTROID: { - /* - Centroid linkage. - - Shorter and longer distances can occur, not bigger than max(d1,d2) - but maybe smaller than min(d1,d2). - */ - // Update the distance matrix in the range [start, idx1). - t_float s = size1/(size1+size2); - t_float t = size2/(size1+size2); - t_float stc = s*t*mindist[idx1]; - for (j=active_nodes.start; j -static void MST_linkage_core_vector(const t_index N, - t_dissimilarity & dist, - cluster_result & Z2) { -/* - N: integer, number of data points - dist: function pointer to the metric - Z2: output data structure - - The basis of this algorithm is an algorithm by Rohlf: - - F. James Rohlf, Hierarchical clustering using the minimum spanning tree, - The Computer Journal, vol. 16, 1973, p. 93–95. -*/ - t_index i; - t_index idx2; - doubly_linked_list active_nodes(N); - auto_array_ptr d(N); - - t_index prev_node; - t_float min; - - // first iteration - idx2 = 1; - min = std::numeric_limits::infinity(); - for (i=1; i tmp) - d[i] = tmp; - else if (fc_isnan(tmp)) - throw (nan_error()); -#if HAVE_DIAGNOSTIC -#pragma GCC diagnostic pop -#endif - if (d[i] < min) { - min = d[i]; - idx2 = i; - } - } - Z2.append(prev_node, idx2, min); - } -} - -template -static void generic_linkage_vector(const t_index N, - t_dissimilarity & dist, - cluster_result & Z2) { - /* - N: integer, number of data points - dist: function pointer to the metric - Z2: output data structure - - This algorithm is valid for the distance update methods - "Ward", "centroid" and "median" only! 
- */ - const t_index N_1 = N-1; - t_index i, j; // loop variables - t_index idx1, idx2; // row and column indices - - auto_array_ptr n_nghbr(N_1); // array of nearest neighbors - auto_array_ptr mindist(N_1); // distances to the nearest neighbors - auto_array_ptr row_repr(N); // row_repr[i]: node number that the - // i-th row represents - doubly_linked_list active_nodes(N); - binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for - // the distance to the nearest neighbor of each point - t_index node1, node2; // node numbers in the output - t_float min; // minimum and row index for nearest-neighbor search - - for (i=0; ii} D(i,j) for i in range(N-1) - for (i=0; i::infinity(); - t_index idx; - for (idx=j=i+1; j(i,j); - } - if (tmp(idx1,j); - for (j=active_nodes.succ[j]; j(idx1,j); - if (tmp(j, idx2); - if (tmp < mindist[j]) { - nn_distances.update_leq(j, tmp); - n_nghbr[j] = idx2; - } - else if (n_nghbr[j] == idx2) - n_nghbr[j] = idx1; // invalidate - } - // Find the nearest neighbor for idx2. - if (idx2(idx2,j); - for (j=active_nodes.succ[j]; j(idx2, j); - if (tmp < min) { - min = tmp; - n_nghbr[idx2] = j; - } - } - nn_distances.update(idx2, min); - } - } - } -} - -template -static void generic_linkage_vector_alternative(const t_index N, - t_dissimilarity & dist, - cluster_result & Z2) { - /* - N: integer, number of data points - dist: function pointer to the metric - Z2: output data structure - - This algorithm is valid for the distance update methods - "Ward", "centroid" and "median" only! - */ - const t_index N_1 = N-1; - t_index i, j=0; // loop variables - t_index idx1, idx2; // row and column indices - - auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors - auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors - - doubly_linked_list active_nodes(N+N_1); - binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap - // structure for the distance to the nearest neighbor of each point - - t_float min; // minimum for nearest-neighbor searches - - // Initialize the minimal distances: - // Find the nearest neighbor of each point. - // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) - for (i=1; i::infinity(); - t_index idx; - for (idx=j=0; j(i,j); - } - if (tmp - -extern "C" { -#include "fastcluster.h" -} - - -int main(int argc, const char* argv[]) { - const int n = 11; - const int m = 3; - double* pts = new double[n*m]{59.26000137, -9.35999966, -5.42500019, - 91.61999817, -0.31999999, -2.75, - 31.38000031, 0.40000001, -0.2, - 89.57999725, -8.07999992, -18.04999924, - 53.42000122, 0.63999999, -0.175, - 31.38000031, 0.47999999, -0.2, - 36.33999939, 0.16, -0.2, - 53.33999939, 0.95999998, -0.175, - 59.26000137, -9.76000023, -5.44999981, - 33.93999977, 0.40000001, -0.22499999, - 106.74000092, -5.76000023, -18.04999924}; - - int * idx = new int[n]; - int * correct_idx = new int[n]{0, 1, 2, 3, 4, 2, 5, 4, 0, 5, 6}; - - cluster_points_centroid(n, m, pts, 2.5 * 2.5, idx); - - for (int i = 0; i < n; i++) { - assert(idx[i] == correct_idx[i]); - } - - delete[] idx; - delete[] correct_idx; - delete[] pts; -}
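Note on the removed functionality: the deleted test_clustering.py validated cluster_points_centroid against a reference path built from centroid-linkage clustering cut at a fixed distance of 2.5. That reference behaviour is available from SciPy alone; a minimal sketch, assuming scipy and numpy are installed, with the points and threshold taken from the deleted test as illustrative values (scipy.cluster.hierarchy.linkage stands in for the removed fastcluster.linkage_vector and should agree up to tie-breaking):

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster

    # A few of the radar track points from the deleted test; any (n, 3) array works.
    pts = np.array([[59.26, -9.36, -5.425],
                    [91.62, -0.32, -2.75],
                    [31.38, 0.40, -0.20],
                    [53.42, 0.64, -0.175]])

    # Centroid linkage on the raw (Euclidean) points, then cut the dendrogram at
    # distance 2.5 -- the same criterion the deleted fcluster() helper applied.
    Z = linkage(pts, method='centroid')
    labels = fcluster(Z, t=2.5, criterion='distance')
    print(labels)  # one integer cluster label per point, starting at 1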