openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1794 lines
51 KiB

/*
fastcluster: Fast hierarchical clustering routines for R and Python
Copyright © 2011 Daniel Müllner
<http://danifold.net>
This library implements various fast algorithms for hierarchical,
agglomerative clustering methods:
(1) Algorithms for the "stored matrix approach": the input is the array of
pairwise dissimilarities.
MST_linkage_core: single linkage clustering with the "minimum spanning
tree algorithm (Rohlfs)
NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
complete, average, weighted and Ward linkage (Murtagh)
generic_linkage: generic algorithm, suitable for all distance update
formulas (Müllner)
(2) Algorithms for the "stored data approach": the input are points in a
vector space.
MST_linkage_core_vector: single linkage clustering for vector data
generic_linkage_vector: generic algorithm for vector data, suitable for
the Ward, centroid and median methods.
generic_linkage_vector_alternative: alternative scheme for updating the
nearest neighbors. This method seems faster than "generic_linkage_vector"
for the centroid and median methods but slower for the Ward method.
All these implementation treat infinity values correctly. They throw an
exception if a NaN distance value occurs.
*/
// Older versions of Microsoft Visual Studio do not have the fenv header.
#ifdef _MSC_VER
#if (_MSC_VER == 1500 || _MSC_VER == 1600)
#define NO_INCLUDE_FENV
#endif
#endif
// NaN detection via fenv might not work on systems with software
// floating-point emulation (bug report for Debian armel).
#ifdef __SOFTFP__
#define NO_INCLUDE_FENV
#endif
#ifdef NO_INCLUDE_FENV
#pragma message("Do not use fenv header.")
#else
#pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
#pragma STDC FENV_ACCESS on
#include <fenv.h>
#endif
#include <cmath> // for std::pow, std::sqrt
#include <cstddef> // for std::ptrdiff_t
#include <limits> // for std::numeric_limits<...>::infinity()
#include <algorithm> // for std::fill_n
#include <stdexcept> // for std::runtime_error
#include <string> // for std::string
#include <cfloat> // also for DBL_MAX, DBL_MIN
#ifndef DBL_MANT_DIG
#error The constant DBL_MANT_DIG could not be defined.
#endif
#define T_FLOAT_MANT_DIG DBL_MANT_DIG
#ifndef LONG_MAX
#include <climits>
#endif
#ifndef LONG_MAX
#error The constant LONG_MAX could not be defined.
#endif
#ifndef INT_MAX
#error The constant INT_MAX could not be defined.
#endif
#ifndef INT32_MAX
#ifdef _MSC_VER
#if _MSC_VER >= 1600
#define __STDC_LIMIT_MACROS
#include <stdint.h>
#else
typedef __int32 int_fast32_t;
typedef __int64 int64_t;
#endif
#else
#define __STDC_LIMIT_MACROS
#include <stdint.h>
#endif
#endif
#define FILL_N std::fill_n
#ifdef _MSC_VER
#if _MSC_VER < 1600
#undef FILL_N
#define FILL_N stdext::unchecked_fill_n
#endif
#endif
// Suppress warnings about (potentially) uninitialized variables.
#ifdef _MSC_VER
#pragma warning (disable:4700)
#endif
#ifndef HAVE_DIAGNOSTIC
#if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
#define HAVE_DIAGNOSTIC 1
#endif
#endif
#ifndef HAVE_VISIBILITY
#if __GNUC__ >= 4
#define HAVE_VISIBILITY 1
#endif
#endif
/* Since the public interface is given by the Python respectively R interface,
* we do not want other symbols than the interface initalization routines to be
* visible in the shared object file. The "visibility" switch is a GCC concept.
* Hiding symbols keeps the relocation table small and decreases startup time.
* See http://gcc.gnu.org/wiki/Visibility
*/
#if HAVE_VISIBILITY
#pragma GCC visibility push(hidden)
#endif
typedef int_fast32_t t_index;
#ifndef INT32_MAX
#define MAX_INDEX 0x7fffffffL
#else
#define MAX_INDEX INT32_MAX
#endif
#if (LONG_MAX < MAX_INDEX)
#error The integer format "t_index" must not have a greater range than "long int".
#endif
#if (INT_MAX > MAX_INDEX)
#error The integer format "int" must not have a greater range than "t_index".
#endif
typedef double t_float;
/* Method codes.
These codes must agree with the METHODS array in fastcluster.R and the
dictionary mthidx in fastcluster.py.
*/
enum method_codes {
// non-Euclidean methods
METHOD_METR_SINGLE = 0,
METHOD_METR_COMPLETE = 1,
METHOD_METR_AVERAGE = 2,
METHOD_METR_WEIGHTED = 3,
METHOD_METR_WARD = 4,
METHOD_METR_WARD_D = METHOD_METR_WARD,
METHOD_METR_CENTROID = 5,
METHOD_METR_MEDIAN = 6,
METHOD_METR_WARD_D2 = 7,
MIN_METHOD_CODE = 0,
MAX_METHOD_CODE = 7
};
enum method_codes_vector {
// Euclidean methods
METHOD_VECTOR_SINGLE = 0,
METHOD_VECTOR_WARD = 1,
METHOD_VECTOR_CENTROID = 2,
METHOD_VECTOR_MEDIAN = 3,
MIN_METHOD_VECTOR_CODE = 0,
MAX_METHOD_VECTOR_CODE = 3
};
// self-destructing array pointer
template <typename type>
class auto_array_ptr{
private:
type * ptr;
auto_array_ptr(auto_array_ptr const &); // non construction-copyable
auto_array_ptr& operator=(auto_array_ptr const &); // non copyable
public:
auto_array_ptr()
: ptr(NULL)
{ }
template <typename index>
auto_array_ptr(index const size)
: ptr(new type[size])
{ }
template <typename index, typename value>
auto_array_ptr(index const size, value const val)
: ptr(new type[size])
{
FILL_N(ptr, size, val);
}
~auto_array_ptr() {
delete [] ptr; }
void free() {
delete [] ptr;
ptr = NULL;
}
template <typename index>
void init(index const size) {
ptr = new type [size];
}
template <typename index, typename value>
void init(index const size, value const val) {
init(size);
FILL_N(ptr, size, val);
}
inline operator type *() const { return ptr; }
};
struct node {
t_index node1, node2;
t_float dist;
};
inline bool operator< (const node a, const node b) {
return (a.dist < b.dist);
}
class cluster_result {
private:
auto_array_ptr<node> Z;
t_index pos;
public:
cluster_result(const t_index size)
: Z(size)
, pos(0)
{}
void append(const t_index node1, const t_index node2, const t_float dist) {
Z[pos].node1 = node1;
Z[pos].node2 = node2;
Z[pos].dist = dist;
++pos;
}
node * operator[] (const t_index idx) const { return Z + idx; }
/* Define several methods to postprocess the distances. All these functions
are monotone, so they do not change the sorted order of distances. */
void sqrt() const {
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
ZZ->dist = std::sqrt(ZZ->dist);
}
}
void sqrt(const t_float) const { // ignore the argument
sqrt();
}
void sqrtdouble(const t_float) const { // ignore the argument
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
ZZ->dist = std::sqrt(2*ZZ->dist);
}
}
#ifdef R_pow
#define my_pow R_pow
#else
#define my_pow std::pow
#endif
void power(const t_float p) const {
t_float const q = 1/p;
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
ZZ->dist = my_pow(ZZ->dist,q);
}
}
void plusone(const t_float) const { // ignore the argument
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
ZZ->dist += 1;
}
}
void divide(const t_float denom) const {
for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) {
ZZ->dist /= denom;
}
}
};
class doubly_linked_list {
/*
Class for a doubly linked list. Initially, the list is the integer range
[0, size]. We provide a forward iterator and a method to delete an index
from the list.
Typical use: for (i=L.start; L<size; i=L.succ[I])
or
for (i=somevalue; L<size; i=L.succ[I])
*/
public:
t_index start;
auto_array_ptr<t_index> succ;
private:
auto_array_ptr<t_index> pred;
// Not necessarily private, we just do not need it in this instance.
public:
doubly_linked_list(const t_index size)
// Initialize to the given size.
: start(0)
, succ(size+1)
, pred(size+1)
{
for (t_index i=0; i<size; ++i) {
pred[i+1] = i;
succ[i] = i+1;
}
// pred[0] is never accessed!
//succ[size] is never accessed!
}
~doubly_linked_list() {}
void remove(const t_index idx) {
// Remove an index from the list.
if (idx==start) {
start = succ[idx];
}
else {
succ[pred[idx]] = succ[idx];
pred[succ[idx]] = pred[idx];
}
succ[idx] = 0; // Mark as inactive
}
bool is_inactive(t_index idx) const {
return (succ[idx]==0);
}
};
// Indexing functions
// D is the upper triangular part of a symmetric (NxN)-matrix
// We require r_ < c_ !
#define D_(r_,c_) ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
// Z is an ((N-1)x4)-array
#define Z_(_r, _c) (Z[(_r)*4 + (_c)])
/*
Lookup function for a union-find data structure.
The function finds the root of idx by going iteratively through all
parent elements until a root is found. An element i is a root if
nodes[i] is zero. To make subsequent searches faster, the entry for
idx and all its parents is updated with the root element.
*/
class union_find {
private:
auto_array_ptr<t_index> parent;
t_index nextparent;
public:
union_find(const t_index size)
: parent(size>0 ? 2*size-1 : 0, 0)
, nextparent(size)
{ }
t_index Find (t_index idx) const {
if (parent[idx] != 0 ) { // a → b
t_index p = idx;
idx = parent[idx];
if (parent[idx] != 0 ) { // a → b → c
do {
idx = parent[idx];
} while (parent[idx] != 0);
do {
t_index tmp = parent[p];
parent[p] = idx;
p = tmp;
} while (parent[p] != idx);
}
}
return idx;
}
void Union (const t_index node1, const t_index node2) {
parent[node1] = parent[node2] = nextparent++;
}
};
class nan_error{};
#ifdef FE_INVALID
class fenv_error{};
#endif
static void MST_linkage_core(const t_index N, const t_float * const D,
cluster_result & Z2) {
/*
N: integer, number of data points
D: condensed distance matrix N*(N-1)/2
Z2: output data structure
The basis of this algorithm is an algorithm by Rohlf:
F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
The Computer Journal, vol. 16, 1973, p. 93–95.
*/
t_index i;
t_index idx2;
doubly_linked_list active_nodes(N);
auto_array_ptr<t_float> d(N);
t_index prev_node;
t_float min;
// first iteration
idx2 = 1;
min = std::numeric_limits<t_float>::infinity();
for (i=1; i<N; ++i) {
d[i] = D[i-1];
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (d[i] < min) {
min = d[i];
idx2 = i;
}
else if (fc_isnan(d[i]))
throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
}
Z2.append(0, idx2, min);
for (t_index j=1; j<N-1; ++j) {
prev_node = idx2;
active_nodes.remove(prev_node);
idx2 = active_nodes.succ[0];
min = d[idx2];
for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
t_float tmp = D_(i, prev_node);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (tmp < d[i])
d[i] = tmp;
else if (fc_isnan(tmp))
throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
if (d[i] < min) {
min = d[i];
idx2 = i;
}
}
for (; i<N; i=active_nodes.succ[i]) {
t_float tmp = D_(prev_node, i);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (d[i] > tmp)
d[i] = tmp;
else if (fc_isnan(tmp))
throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
if (d[i] < min) {
min = d[i];
idx2 = i;
}
}
Z2.append(prev_node, idx2, min);
}
}
/* Functions for the update of the dissimilarity array */
inline static void f_single( t_float * const b, const t_float a ) {
if (*b > a) *b = a;
}
inline static void f_complete( t_float * const b, const t_float a ) {
if (*b < a) *b = a;
}
inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
*b = s*a + t*(*b);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (fc_isnan(*b)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
inline static void f_weighted( t_float * const b, const t_float a) {
*b = (a+*b)*.5;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (fc_isnan(*b)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
*b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v);
//*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (fc_isnan(*b)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
*b = s*a - stc + t*(*b);
#ifndef FE_INVALID
if (fc_isnan(*b)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
*b = (a+(*b))*.5 - c_4;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (fc_isnan(*b)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
template <method_codes method, typename t_members>
static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
/*
N: integer
D: condensed distance matrix N*(N-1)/2
Z2: output data structure
This is the NN-chain algorithm, described on page 86 in the following book:
Fionn Murtagh, Multidimensional Clustering Algorithms,
Vienna, Würzburg: Physica-Verlag, 1985.
*/
t_index i;
auto_array_ptr<t_index> NN_chain(N);
t_index NN_chain_tip = 0;
t_index idx1, idx2;
t_float size1, size2;
doubly_linked_list active_nodes(N);
t_float min;
for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
++DD) {
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (fc_isnan(*DD)) {
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
}
#ifdef FE_INVALID
if (feclearexcept(FE_INVALID)) throw fenv_error();
#endif
for (t_index j=0; j<N-1; ++j) {
if (NN_chain_tip <= 3) {
NN_chain[0] = idx1 = active_nodes.start;
NN_chain_tip = 1;
idx2 = active_nodes.succ[idx1];
min = D_(idx1,idx2);
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
if (D_(idx1,i) < min) {
min = D_(idx1,i);
idx2 = i;
}
}
} // a: idx1 b: idx2
else {
NN_chain_tip -= 3;
idx1 = NN_chain[NN_chain_tip-1];
idx2 = NN_chain[NN_chain_tip];
min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
} // a: idx1 b: idx2
do {
NN_chain[NN_chain_tip] = idx2;
for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
if (D_(i,idx2) < min) {
min = D_(i,idx2);
idx1 = i;
}
}
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
if (D_(idx2,i) < min) {
min = D_(idx2,i);
idx1 = i;
}
}
idx2 = idx1;
idx1 = NN_chain[NN_chain_tip++];
} while (idx2 != NN_chain[NN_chain_tip-2]);
Z2.append(idx1, idx2, min);
if (idx1>idx2) {
t_index tmp = idx1;
idx1 = idx2;
idx2 = tmp;
}
if (method==METHOD_METR_AVERAGE ||
method==METHOD_METR_WARD) {
size1 = static_cast<t_float>(members[idx1]);
size2 = static_cast<t_float>(members[idx2]);
members[idx2] += members[idx1];
}
// Remove the smaller index from the valid indices (active_nodes).
active_nodes.remove(idx1);
switch (method) {
case METHOD_METR_SINGLE:
/*
Single linkage.
Characteristic: new distances are never longer than the old distances.
*/
// Update the distance matrix in the range [start, idx1).
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
f_single(&D_(i, idx2), D_(i, idx1) );
// Update the distance matrix in the range (idx1, idx2).
for (; i<idx2; i=active_nodes.succ[i])
f_single(&D_(i, idx2), D_(idx1, i) );
// Update the distance matrix in the range (idx2, N).
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
f_single(&D_(idx2, i), D_(idx1, i) );
break;
case METHOD_METR_COMPLETE:
/*
Complete linkage.
Characteristic: new distances are never shorter than the old distances.
*/
// Update the distance matrix in the range [start, idx1).
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
f_complete(&D_(i, idx2), D_(i, idx1) );
// Update the distance matrix in the range (idx1, idx2).
for (; i<idx2; i=active_nodes.succ[i])
f_complete(&D_(i, idx2), D_(idx1, i) );
// Update the distance matrix in the range (idx2, N).
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
f_complete(&D_(idx2, i), D_(idx1, i) );
break;
case METHOD_METR_AVERAGE: {
/*
Average linkage.
Shorter and longer distances can occur.
*/
// Update the distance matrix in the range [start, idx1).
t_float s = size1/(size1+size2);
t_float t = size2/(size1+size2);
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
f_average(&D_(i, idx2), D_(i, idx1), s, t );
// Update the distance matrix in the range (idx1, idx2).
for (; i<idx2; i=active_nodes.succ[i])
f_average(&D_(i, idx2), D_(idx1, i), s, t );
// Update the distance matrix in the range (idx2, N).
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
f_average(&D_(idx2, i), D_(idx1, i), s, t );
break;
}
case METHOD_METR_WEIGHTED:
/*
Weighted linkage.
Shorter and longer distances can occur.
*/
// Update the distance matrix in the range [start, idx1).
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
f_weighted(&D_(i, idx2), D_(i, idx1) );
// Update the distance matrix in the range (idx1, idx2).
for (; i<idx2; i=active_nodes.succ[i])
f_weighted(&D_(i, idx2), D_(idx1, i) );
// Update the distance matrix in the range (idx2, N).
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
f_weighted(&D_(idx2, i), D_(idx1, i) );
break;
case METHOD_METR_WARD:
/*
Ward linkage.
Shorter and longer distances can occur, not smaller than min(d1,d2)
but maybe bigger than max(d1,d2).
*/
// Update the distance matrix in the range [start, idx1).
//t_float v = static_cast<t_float>(members[i]);
for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
f_ward(&D_(i, idx2), D_(i, idx1), min,
size1, size2, static_cast<t_float>(members[i]) );
// Update the distance matrix in the range (idx1, idx2).
for (; i<idx2; i=active_nodes.succ[i])
f_ward(&D_(i, idx2), D_(idx1, i), min,
size1, size2, static_cast<t_float>(members[i]) );
// Update the distance matrix in the range (idx2, N).
for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
f_ward(&D_(idx2, i), D_(idx1, i), min,
size1, size2, static_cast<t_float>(members[i]) );
break;
default:
throw std::runtime_error(std::string("Invalid method."));
}
}
#ifdef FE_INVALID
if (fetestexcept(FE_INVALID)) throw fenv_error();
#endif
}
class binary_min_heap {
/*
Class for a binary min-heap. The data resides in an array A. The elements of
A are not changed but two lists I and R of indices are generated which point
to elements of A and backwards.
The heap tree structure is
H[2*i+1] H[2*i+2]
\ /
\ /
≤ ≤
\ /
\ /
H[i]
where the children must be less or equal than their parent. Thus, H[0]
contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
and R[I[i]] = i.
This implementation is not designed to handle NaN values.
*/
private:
t_float * const A;
t_index size;
auto_array_ptr<t_index> I;
auto_array_ptr<t_index> R;
// no default constructor
binary_min_heap();
// noncopyable
binary_min_heap(binary_min_heap const &);
binary_min_heap & operator=(binary_min_heap const &);
public:
binary_min_heap(t_float * const A_, const t_index size_)
: A(A_), size(size_), I(size), R(size)
{ // Allocate memory and initialize the lists I and R to the identity. This
// does not make it a heap. Call heapify afterwards!
for (t_index i=0; i<size; ++i)
R[i] = I[i] = i;
}
binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
const t_index start)
: A(A_), size(size1), I(size1), R(size2)
{ // Allocate memory and initialize the lists I and R to the identity. This
// does not make it a heap. Call heapify afterwards!
for (t_index i=0; i<size; ++i) {
R[i+start] = i;
I[i] = i + start;
}
}
~binary_min_heap() {}
void heapify() {
// Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
// condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
//
// Complexity: Θ(size)
// Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
// 3rd ed., 2009, Section 6.3 “Building a heap”
t_index idx;
for (idx=(size>>1); idx>0; ) {
--idx;
update_geq_(idx);
}
}
inline t_index argmin() const {
// Return the minimal element.
return I[0];
}
void heap_pop() {
// Remove the minimal element from the heap.
--size;
I[0] = I[size];
R[I[0]] = 0;
update_geq_(0);
}
void remove(t_index idx) {
// Remove an element from the heap.
--size;
R[I[size]] = R[idx];
I[R[idx]] = I[size];
if ( H(size)<=A[idx] ) {
update_leq_(R[idx]);
}
else {
update_geq_(R[idx]);
}
}
void replace ( const t_index idxold, const t_index idxnew,
const t_float val) {
R[idxnew] = R[idxold];
I[R[idxnew]] = idxnew;
if (val<=A[idxold])
update_leq(idxnew, val);
else
update_geq(idxnew, val);
}
void update ( const t_index idx, const t_float val ) const {
// Update the element A[i] with val and re-arrange the indices to preserve
// the heap condition.
if (val<=A[idx])
update_leq(idx, val);
else
update_geq(idx, val);
}
void update_leq ( const t_index idx, const t_float val ) const {
// Use this when the new value is not more than the old value.
A[idx] = val;
update_leq_(R[idx]);
}
void update_geq ( const t_index idx, const t_float val ) const {
// Use this when the new value is not less than the old value.
A[idx] = val;
update_geq_(R[idx]);
}
private:
void update_leq_ (t_index i) const {
t_index j;
for ( ; (i>0) && ( H(i)<H(j=(i-1)>>1) ); i=j)
heap_swap(i,j);
}
void update_geq_ (t_index i) const {
t_index j;
for ( ; (j=2*i+1)<size; i=j) {
if ( H(j)>=H(i) ) {
++j;
if ( j>=size || H(j)>=H(i) ) break;
}
else if ( j+1<size && H(j+1)<H(j) ) ++j;
heap_swap(i, j);
}
}
void heap_swap(const t_index i, const t_index j) const {
// Swap two indices.
t_index tmp = I[i];
I[i] = I[j];
I[j] = tmp;
R[I[i]] = i;
R[I[j]] = j;
}
inline t_float H(const t_index i) const {
return A[I[i]];
}
};
template <method_codes method, typename t_members>
static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
/*
N: integer, number of data points
D: condensed distance matrix N*(N-1)/2
Z2: output data structure
*/
const t_index N_1 = N-1;
t_index i, j; // loop variables
t_index idx1, idx2; // row and column indices
auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
// i-th row represents
doubly_linked_list active_nodes(N);
binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
// the distance to the nearest neighbor of each point
t_index node1, node2; // node numbers in the output
t_float size1, size2; // and their cardinalities
t_float min; // minimum and row index for nearest-neighbor search
t_index idx;
for (i=0; i<N; ++i)
// Build a list of row ↔ node label assignments.
// Initially i ↦ i
row_repr[i] = i;
// Initialize the minimal distances:
// Find the nearest neighbor of each point.
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
t_float const * DD = D;
for (i=0; i<N_1; ++i) {
min = std::numeric_limits<t_float>::infinity();
for (idx=j=i+1; j<N; ++j, ++DD) {
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (*DD<min) {
min = *DD;
idx = j;
}
else if (fc_isnan(*DD))
throw(nan_error());
}
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
mindist[i] = min;
n_nghbr[i] = idx;
}
// Put the minimal distances into a heap structure to make the repeated
// global minimum searches fast.
nn_distances.heapify();
#ifdef FE_INVALID
if (feclearexcept(FE_INVALID)) throw fenv_error();
#endif
// Main loop: We have N-1 merging steps.
for (i=0; i<N_1; ++i) {
/*
Here is a special feature that allows fast bookkeeping and updates of the
minimal distances.
mindist[i] stores a lower bound on the minimum distance of the point i to
all points of higher index:
mindist[i] ≥ min_{j>i} D(i,j)
Normally, we have equality. However, this minimum may become invalid due
to the updates in the distance matrix. The rules are:
1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
minimum and n_nghbr[i] is a nearest neighbor.
2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
correct minimum. The minimum needs to be recomputed.
3) mindist[i] is never bigger than the true minimum. Hence, we never
miss the true minimum if we take the smallest mindist entry,
re-compute the value if necessary (thus maybe increasing it) and
looking for the now smallest mindist entry until a valid minimal
entry is found. This step is done in the lines below.
The update process for D below takes care that these rules are
fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
re-calculated when necessary but re-calculation is avoided whenever
possible.
The re-calculation of the minima makes the worst-case runtime of this
algorithm cubic in N. We avoid this whenever possible, and in most cases
the runtime appears to be quadratic.
*/
idx1 = nn_distances.argmin();
if (method != METHOD_METR_SINGLE) {
while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) {
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
min = D_(idx1,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
if (D_(idx1,j)<min) {
min = D_(idx1,j);
n_nghbr[idx1] = j;
}
}
/* Update the heap with the new true minimum and search for the
(possibly different) minimal entry. */
nn_distances.update_geq(idx1, min);
idx1 = nn_distances.argmin();
}
}
nn_distances.heap_pop(); // Remove the current minimum from the heap.
idx2 = n_nghbr[idx1];
// Write the newly found minimal pair of nodes to the output array.
node1 = row_repr[idx1];
node2 = row_repr[idx2];
if (method==METHOD_METR_AVERAGE ||
method==METHOD_METR_WARD ||
method==METHOD_METR_CENTROID) {
size1 = static_cast<t_float>(members[idx1]);
size2 = static_cast<t_float>(members[idx2]);
members[idx2] += members[idx1];
}
Z2.append(node1, node2, mindist[idx1]);
// Remove idx1 from the list of active indices (active_nodes).
active_nodes.remove(idx1);
// Index idx2 now represents the new (merged) node with label N+i.
row_repr[idx2] = N+i;
// Update the distance matrix
switch (method) {
case METHOD_METR_SINGLE:
/*
Single linkage.
Characteristic: new distances are never longer than the old distances.
*/
// Update the distance matrix in the range [start, idx1).
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_single(&D_(j, idx2), D_(j, idx1));
if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_single(&D_(j, idx2), D_(idx1, j));
// If the new value is below the old minimum in a row, update
// the mindist and n_nghbr arrays.
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
// Recompute the minimum mindist[idx2] and n_nghbr[idx2].
if (idx2<N_1) {
min = mindist[idx2];
for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j]) {
f_single(&D_(idx2, j), D_(idx1, j) );
if (D_(idx2, j) < min) {
n_nghbr[idx2] = j;
min = D_(idx2, j);
}
}
nn_distances.update_leq(idx2, min);
}
break;
case METHOD_METR_COMPLETE:
/*
Complete linkage.
Characteristic: new distances are never shorter than the old distances.
*/
// Update the distance matrix in the range [start, idx1).
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_complete(&D_(j, idx2), D_(j, idx1) );
if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j])
f_complete(&D_(j, idx2), D_(idx1, j) );
// Update the distance matrix in the range (idx2, N).
for (j=active_nodes.succ[idx2]; j<N; j=active_nodes.succ[j])
f_complete(&D_(idx2, j), D_(idx1, j) );
break;
case METHOD_METR_AVERAGE: {
/*
Average linkage.
Shorter and longer distances can occur.
*/
// Update the distance matrix in the range [start, idx1).
t_float s = size1/(size1+size2);
t_float t = size2/(size1+size2);
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_average(&D_(j, idx2), D_(j, idx1), s, t);
if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_average(&D_(j, idx2), D_(idx1, j), s, t);
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
f_average(&D_(idx2, j), D_(idx1, j), s, t);
min = D_(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
f_average(&D_(idx2, j), D_(idx1, j), s, t);
if (D_(idx2,j) < min) {
min = D_(idx2,j);
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
}
case METHOD_METR_WEIGHTED:
/*
Weighted linkage.
Shorter and longer distances can occur.
*/
// Update the distance matrix in the range [start, idx1).
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_weighted(&D_(j, idx2), D_(j, idx1) );
if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_weighted(&D_(j, idx2), D_(idx1, j) );
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
f_weighted(&D_(idx2, j), D_(idx1, j) );
min = D_(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
f_weighted(&D_(idx2, j), D_(idx1, j) );
if (D_(idx2,j) < min) {
min = D_(idx2,j);
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
case METHOD_METR_WARD:
/*
Ward linkage.
Shorter and longer distances can occur, not smaller than min(d1,d2)
but maybe bigger than max(d1,d2).
*/
// Update the distance matrix in the range [start, idx1).
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1],
size1, size2, static_cast<t_float>(members[j]) );
if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2,
static_cast<t_float>(members[j]) );
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
size1, size2, static_cast<t_float>(members[j]) );
min = D_(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1],
size1, size2, static_cast<t_float>(members[j]) );
if (D_(idx2,j) < min) {
min = D_(idx2,j);
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
case METHOD_METR_CENTROID: {
/*
Centroid linkage.
Shorter and longer distances can occur, not bigger than max(d1,d2)
but maybe smaller than min(d1,d2).
*/
// Update the distance matrix in the range [start, idx1).
t_float s = size1/(size1+size2);
t_float t = size2/(size1+size2);
t_float stc = s*t*mindist[idx1];
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
else if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
min = D_(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
if (D_(idx2,j) < min) {
min = D_(idx2,j);
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
}
case METHOD_METR_MEDIAN: {
/*
Median linkage.
Shorter and longer distances can occur, not bigger than max(d1,d2)
but maybe smaller than min(d1,d2).
*/
// Update the distance matrix in the range [start, idx1).
t_float c_4 = mindist[idx1]*.25;
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
f_median(&D_(j, idx2), D_(j, idx1), c_4 );
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
else if (n_nghbr[j] == idx1)
n_nghbr[j] = idx2;
}
// Update the distance matrix in the range (idx1, idx2).
for (; j<idx2; j=active_nodes.succ[j]) {
f_median(&D_(j, idx2), D_(idx1, j), c_4 );
if (D_(j, idx2) < mindist[j]) {
nn_distances.update_leq(j, D_(j, idx2));
n_nghbr[j] = idx2;
}
}
// Update the distance matrix in the range (idx2, N).
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
f_median(&D_(idx2, j), D_(idx1, j), c_4 );
min = D_(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
f_median(&D_(idx2, j), D_(idx1, j), c_4 );
if (D_(idx2,j) < min) {
min = D_(idx2,j);
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
}
default:
throw std::runtime_error(std::string("Invalid method."));
}
}
#ifdef FE_INVALID
if (fetestexcept(FE_INVALID)) throw fenv_error();
#endif
}
/*
Clustering methods for vector data
*/
template <typename t_dissimilarity>
static void MST_linkage_core_vector(const t_index N,
t_dissimilarity & dist,
cluster_result & Z2) {
/*
N: integer, number of data points
dist: function pointer to the metric
Z2: output data structure
The basis of this algorithm is an algorithm by Rohlf:
F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
The Computer Journal, vol. 16, 1973, p. 93–95.
*/
t_index i;
t_index idx2;
doubly_linked_list active_nodes(N);
auto_array_ptr<t_float> d(N);
t_index prev_node;
t_float min;
// first iteration
idx2 = 1;
min = std::numeric_limits<t_float>::infinity();
for (i=1; i<N; ++i) {
d[i] = dist(0,i);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (d[i] < min) {
min = d[i];
idx2 = i;
}
else if (fc_isnan(d[i]))
throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
}
Z2.append(0, idx2, min);
for (t_index j=1; j<N-1; ++j) {
prev_node = idx2;
active_nodes.remove(prev_node);
idx2 = active_nodes.succ[0];
min = d[idx2];
for (i=idx2; i<N; i=active_nodes.succ[i]) {
t_float tmp = dist(i, prev_node);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
if (d[i] > tmp)
d[i] = tmp;
else if (fc_isnan(tmp))
throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
if (d[i] < min) {
min = d[i];
idx2 = i;
}
}
Z2.append(prev_node, idx2, min);
}
}
template <method_codes_vector method, typename t_dissimilarity>
static void generic_linkage_vector(const t_index N,
t_dissimilarity & dist,
cluster_result & Z2) {
/*
N: integer, number of data points
dist: function pointer to the metric
Z2: output data structure
This algorithm is valid for the distance update methods
"Ward", "centroid" and "median" only!
*/
const t_index N_1 = N-1;
t_index i, j; // loop variables
t_index idx1, idx2; // row and column indices
auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
// i-th row represents
doubly_linked_list active_nodes(N);
binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
// the distance to the nearest neighbor of each point
t_index node1, node2; // node numbers in the output
t_float min; // minimum and row index for nearest-neighbor search
for (i=0; i<N; ++i)
// Build a list of row ↔ node label assignments.
// Initially i ↦ i
row_repr[i] = i;
// Initialize the minimal distances:
// Find the nearest neighbor of each point.
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
for (i=0; i<N_1; ++i) {
min = std::numeric_limits<t_float>::infinity();
t_index idx;
for (idx=j=i+1; j<N; ++j) {
t_float tmp;
switch (method) {
case METHOD_VECTOR_WARD:
tmp = dist.ward_initial(i,j);
break;
default:
tmp = dist.template sqeuclidean<true>(i,j);
}
if (tmp<min) {
min = tmp;
idx = j;
}
}
switch (method) {
case METHOD_VECTOR_WARD:
mindist[i] = t_dissimilarity::ward_initial_conversion(min);
break;
default:
mindist[i] = min;
}
n_nghbr[i] = idx;
}
// Put the minimal distances into a heap structure to make the repeated
// global minimum searches fast.
nn_distances.heapify();
// Main loop: We have N-1 merging steps.
for (i=0; i<N_1; ++i) {
idx1 = nn_distances.argmin();
while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
switch (method) {
case METHOD_VECTOR_WARD:
min = dist.ward(idx1,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
t_float const tmp = dist.ward(idx1,j);
if (tmp<min) {
min = tmp;
n_nghbr[idx1] = j;
}
}
break;
default:
min = dist.template sqeuclidean<true>(idx1,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
t_float const tmp = dist.template sqeuclidean<true>(idx1,j);
if (tmp<min) {
min = tmp;
n_nghbr[idx1] = j;
}
}
}
/* Update the heap with the new true minimum and search for the (possibly
different) minimal entry. */
nn_distances.update_geq(idx1, min);
idx1 = nn_distances.argmin();
}
nn_distances.heap_pop(); // Remove the current minimum from the heap.
idx2 = n_nghbr[idx1];
// Write the newly found minimal pair of nodes to the output array.
node1 = row_repr[idx1];
node2 = row_repr[idx2];
Z2.append(node1, node2, mindist[idx1]);
switch (method) {
case METHOD_VECTOR_WARD:
case METHOD_VECTOR_CENTROID:
dist.merge_inplace(idx1, idx2);
break;
case METHOD_VECTOR_MEDIAN:
dist.merge_inplace_weighted(idx1, idx2);
break;
default:
throw std::runtime_error(std::string("Invalid method."));
}
// Index idx2 now represents the new (merged) node with label N+i.
row_repr[idx2] = N+i;
// Remove idx1 from the list of active indices (active_nodes).
active_nodes.remove(idx1); // TBD later!!!
// Update the distance matrix
switch (method) {
case METHOD_VECTOR_WARD:
/*
Ward linkage.
Shorter and longer distances can occur, not smaller than min(d1,d2)
but maybe bigger than max(d1,d2).
*/
// Update the distance matrix in the range [start, idx1).
for (j=active_nodes.start; j<idx1; j=active_nodes.succ[j]) {
if (n_nghbr[j] == idx2) {
n_nghbr[j] = idx1; // invalidate
}
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j<idx2; j=active_nodes.succ[j]) {
t_float const tmp = dist.ward(j, idx2);
if (tmp < mindist[j]) {
nn_distances.update_leq(j, tmp);
n_nghbr[j] = idx2;
}
else if (n_nghbr[j]==idx2) {
n_nghbr[j] = idx1; // invalidate
}
}
// Find the nearest neighbor for idx2.
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
min = dist.ward(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
t_float const tmp = dist.ward(idx2,j);
if (tmp < min) {
min = tmp;
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
break;
default:
/*
Centroid and median linkage.
Shorter and longer distances can occur, not bigger than max(d1,d2)
but maybe smaller than min(d1,d2).
*/
for (j=active_nodes.start; j<idx2; j=active_nodes.succ[j]) {
t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
if (tmp < mindist[j]) {
nn_distances.update_leq(j, tmp);
n_nghbr[j] = idx2;
}
else if (n_nghbr[j] == idx2)
n_nghbr[j] = idx1; // invalidate
}
// Find the nearest neighbor for idx2.
if (idx2<N_1) {
n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
min = dist.template sqeuclidean<true>(idx2,j);
for (j=active_nodes.succ[j]; j<N; j=active_nodes.succ[j]) {
t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
if (tmp < min) {
min = tmp;
n_nghbr[idx2] = j;
}
}
nn_distances.update(idx2, min);
}
}
}
}
template <method_codes_vector method, typename t_dissimilarity>
static void generic_linkage_vector_alternative(const t_index N,
t_dissimilarity & dist,
cluster_result & Z2) {
/*
N: integer, number of data points
dist: function pointer to the metric
Z2: output data structure
This algorithm is valid for the distance update methods
"Ward", "centroid" and "median" only!
*/
const t_index N_1 = N-1;
t_index i, j=0; // loop variables
t_index idx1, idx2; // row and column indices
auto_array_ptr<t_index> n_nghbr(2*N-2); // array of nearest neighbors
auto_array_ptr<t_float> mindist(2*N-2); // distances to the nearest neighbors
doubly_linked_list active_nodes(N+N_1);
binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap
// structure for the distance to the nearest neighbor of each point
t_float min; // minimum for nearest-neighbor searches
// Initialize the minimal distances:
// Find the nearest neighbor of each point.
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
for (i=1; i<N; ++i) {
min = std::numeric_limits<t_float>::infinity();
t_index idx;
for (idx=j=0; j<i; ++j) {
t_float tmp;
switch (method) {
case METHOD_VECTOR_WARD:
tmp = dist.ward_initial(i,j);
break;
default:
tmp = dist.template sqeuclidean<true>(i,j);
}
if (tmp<min) {
min = tmp;
idx = j;
}
}
switch (method) {
case METHOD_VECTOR_WARD:
mindist[i] = t_dissimilarity::ward_initial_conversion(min);
break;
default:
mindist[i] = min;
}
n_nghbr[i] = idx;
}
// Put the minimal distances into a heap structure to make the repeated
// global minimum searches fast.
nn_distances.heapify();
// Main loop: We have N-1 merging steps.
for (i=N; i<N+N_1; ++i) {
/*
The bookkeeping is different from the "stored matrix approach" algorithm
generic_linkage.
mindist[i] stores a lower bound on the minimum distance of the point i to
all points of *lower* index:
mindist[i] ≥ min_{j<i} D(i,j)
Moreover, new nodes do not re-use one of the old indices, but they are
given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
new nodes are N,…,2N−2).
Invalid nearest neighbors are not recognized by the fact that the stored
distance is smaller than the actual distance, but the list active_nodes
maintains a flag whether a node is inactive. If n_nghbr[i] points to an
active node, the entries nn_distances[i] and n_nghbr[i] are valid,
otherwise they must be recomputed.
*/
idx1 = nn_distances.argmin();
while ( active_nodes.is_inactive(n_nghbr[idx1]) ) {
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
n_nghbr[idx1] = j = active_nodes.start;
switch (method) {
case METHOD_VECTOR_WARD:
min = dist.ward_extended(idx1,j);
for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
t_float tmp = dist.ward_extended(idx1,j);
if (tmp<min) {
min = tmp;
n_nghbr[idx1] = j;
}
}
break;
default:
min = dist.sqeuclidean_extended(idx1,j);
for (j=active_nodes.succ[j]; j<idx1; j=active_nodes.succ[j]) {
t_float const tmp = dist.sqeuclidean_extended(idx1,j);
if (tmp<min) {
min = tmp;
n_nghbr[idx1] = j;
}
}
}
/* Update the heap with the new true minimum and search for the (possibly
different) minimal entry. */
nn_distances.update_geq(idx1, min);
idx1 = nn_distances.argmin();
}
idx2 = n_nghbr[idx1];
active_nodes.remove(idx1);
active_nodes.remove(idx2);
Z2.append(idx1, idx2, mindist[idx1]);
if (i<2*N_1) {
switch (method) {
case METHOD_VECTOR_WARD:
case METHOD_VECTOR_CENTROID:
dist.merge(idx1, idx2, i);
break;
case METHOD_VECTOR_MEDIAN:
dist.merge_weighted(idx1, idx2, i);
break;
default:
throw std::runtime_error(std::string("Invalid method."));
}
n_nghbr[i] = active_nodes.start;
if (method==METHOD_VECTOR_WARD) {
/*
Ward linkage.
Shorter and longer distances can occur, not smaller than min(d1,d2)
but maybe bigger than max(d1,d2).
*/
min = dist.ward_extended(active_nodes.start, i);
for (j=active_nodes.succ[active_nodes.start]; j<i;
j=active_nodes.succ[j]) {
t_float tmp = dist.ward_extended(j, i);
if (tmp < min) {
min = tmp;
n_nghbr[i] = j;
}
}
}
else {
/*
Centroid and median linkage.
Shorter and longer distances can occur, not bigger than max(d1,d2)
but maybe smaller than min(d1,d2).
*/
min = dist.sqeuclidean_extended(active_nodes.start, i);
for (j=active_nodes.succ[active_nodes.start]; j<i;
j=active_nodes.succ[j]) {
t_float tmp = dist.sqeuclidean_extended(j, i);
if (tmp < min) {
min = tmp;
n_nghbr[i] = j;
}
}
}
if (idx2<active_nodes.start) {
nn_distances.remove(active_nodes.start);
} else {
nn_distances.remove(idx2);
}
nn_distances.replace(idx1, i, min);
}
}
}
#if HAVE_VISIBILITY
#pragma GCC visibility pop
#endif