AGGLOMERATIVE
Agglomerative clustering starts with each sample as its own cluster and repeatedly merges the closest pair according to a chosen linkage rule.
For example, Ward’s linkage (the default method) merges clusters A and B to minimize the increase in total within-cluster variance, evaluating distances as:
d(A, B) = \frac{|A| |B|}{|A| + |B|} ||\mu_A - \mu_B||^2
where \mu_A and \mu_B are the centroids of clusters A and B, respectively.
This wrapper accepts data with rows as samples and columns as features. It returns the final labels, a compact label count table, the merge tree children array, and optional merge distances when distance computation is enabled.
Excel Usage
=AGGLOMERATIVE(data, n_clusters, aggl_linkage, aggl_metric, compute_distances)
data(list[list], required): 2D array of input data with rows as samples and columns as features.n_clusters(int, optional, default: 2): Number of clusters to keep after hierarchical merging.aggl_linkage(str, optional, default: “ward”): Linkage rule used when merging clusters.aggl_metric(str, optional, default: “euclidean”): Distance metric used when evaluating merges.compute_distances(bool, optional, default: false): Whether to store merge distances for the fitted tree.
Returns (dict): Excel data type containing the cluster count, labels, label counts, and hierarchical tree arrays.
Example 1: Form two clusters with Ward linkage on separated points
Inputs:
| data | n_clusters | aggl_linkage | aggl_metric | compute_distances | |
|---|---|---|---|---|---|
| 0 | 0 | 2 | ward | euclidean | false |
| 0 | 1 | ||||
| 1 | 0 | ||||
| 5 | 5 | ||||
| 5 | 6 | ||||
| 6 | 5 |
Excel formula:
=AGGLOMERATIVE({0,0;0,1;1,0;5,5;5,6;6,5}, 2, "ward", "euclidean", FALSE)
Expected output:
{"type":"Double","basicValue":2,"properties":{"cluster_count":{"type":"Double","basicValue":2},"labels":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}]]},"label_counts":{"type":"Array","elements":[[{"type":"String","basicValue":"label"},{"type":"String","basicValue":"count"}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":3}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":3}]]},"children":{"type":"Array","elements":[[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":3},{"type":"Double","basicValue":4}],[{"type":"Double","basicValue":2},{"type":"Double","basicValue":6}],[{"type":"Double","basicValue":5},{"type":"Double","basicValue":7}],[{"type":"Double","basicValue":8},{"type":"Double","basicValue":9}]]},"n_leaves":{"type":"Double","basicValue":6},"n_connected_components":{"type":"Double","basicValue":1}}}
Example 2: Compute merge distances with average linkage and Manhattan distance
Inputs:
| data | n_clusters | aggl_linkage | aggl_metric | compute_distances | |
|---|---|---|---|---|---|
| 0 | 0 | 2 | average | manhattan | true |
| 0 | 1 | ||||
| 1 | 0 | ||||
| 6 | 6 | ||||
| 6 | 7 | ||||
| 7 | 6 |
Excel formula:
=AGGLOMERATIVE({0,0;0,1;1,0;6,6;6,7;7,6}, 2, "average", "manhattan", TRUE)
Expected output:
{"type":"Double","basicValue":2,"properties":{"cluster_count":{"type":"Double","basicValue":2},"labels":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}]]},"label_counts":{"type":"Array","elements":[[{"type":"String","basicValue":"label"},{"type":"String","basicValue":"count"}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":3}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":3}]]},"children":{"type":"Array","elements":[[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":3},{"type":"Double","basicValue":4}],[{"type":"Double","basicValue":2},{"type":"Double","basicValue":6}],[{"type":"Double","basicValue":5},{"type":"Double","basicValue":7}],[{"type":"Double","basicValue":8},{"type":"Double","basicValue":9}]]},"n_leaves":{"type":"Double","basicValue":6},"n_connected_components":{"type":"Double","basicValue":1},"distances":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1.5}],[{"type":"Double","basicValue":1.5}],[{"type":"Double","basicValue":12}]]}}}
Example 3: Merge one-dimensional samples with complete linkage and L1 distance
Inputs:
| data | n_clusters | aggl_linkage | aggl_metric | compute_distances |
|---|---|---|---|---|
| 0 | 2 | complete | l1 | false |
| 0.1 | ||||
| 0.2 | ||||
| 5 | ||||
| 5.1 | ||||
| 5.2 |
Excel formula:
=AGGLOMERATIVE({0;0.1;0.2;5;5.1;5.2}, 2, "complete", "l1", FALSE)
Expected output:
{"type":"Double","basicValue":2,"properties":{"cluster_count":{"type":"Double","basicValue":2},"labels":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}]]},"label_counts":{"type":"Array","elements":[[{"type":"String","basicValue":"label"},{"type":"String","basicValue":"count"}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":3}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":3}]]},"children":{"type":"Array","elements":[[{"type":"Double","basicValue":3},{"type":"Double","basicValue":4}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":2},{"type":"Double","basicValue":7}],[{"type":"Double","basicValue":5},{"type":"Double","basicValue":6}],[{"type":"Double","basicValue":8},{"type":"Double","basicValue":9}]]},"n_leaves":{"type":"Double","basicValue":6},"n_connected_components":{"type":"Double","basicValue":1}}}
Example 4: Separate directional vectors with cosine single linkage
Inputs:
| data | n_clusters | aggl_linkage | aggl_metric | compute_distances | |
|---|---|---|---|---|---|
| 1 | 0 | 2 | single | cosine | true |
| 0.95 | 0.05 | ||||
| 0 | 1 | ||||
| 0.05 | 0.95 |
Excel formula:
=AGGLOMERATIVE({1,0;0.95,0.05;0,1;0.05,0.95}, 2, "single", "cosine", TRUE)
Expected output:
{"type":"Double","basicValue":2,"properties":{"cluster_count":{"type":"Double","basicValue":2},"labels":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":0}]]},"label_counts":{"type":"Array","elements":[[{"type":"String","basicValue":"label"},{"type":"String","basicValue":"count"}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":2}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":2}]]},"children":{"type":"Array","elements":[[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":2},{"type":"Double","basicValue":3}],[{"type":"Double","basicValue":4},{"type":"Double","basicValue":5}]]},"n_leaves":{"type":"Double","basicValue":4},"n_connected_components":{"type":"Double","basicValue":1},"distances":{"type":"Array","elements":[[{"type":"Double","basicValue":0.00138217}],[{"type":"Double","basicValue":0.00138217}],[{"type":"Double","basicValue":0.895028}]]}}}
Python Code
import numpy as np
from sklearn.cluster import AgglomerativeClustering as SklearnAgglomerativeClustering
def agglomerative(data, n_clusters=2, aggl_linkage='ward', aggl_metric='euclidean', compute_distances=False):
"""
Build hierarchical clusters by recursively merging samples.
See: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
This example function is provided as-is without any representation of accuracy.
Args:
data (list[list]): 2D array of input data with rows as samples and columns as features.
n_clusters (int, optional): Number of clusters to keep after hierarchical merging. Default is 2.
aggl_linkage (str, optional): Linkage rule used when merging clusters. Valid options: Ward, Complete, Average, Single. Default is 'ward'.
aggl_metric (str, optional): Distance metric used when evaluating merges. Valid options: Euclidean, Manhattan, L1, L2, Cosine. Default is 'euclidean'.
compute_distances (bool, optional): Whether to store merge distances for the fitted tree. Default is False.
Returns:
dict: Excel data type containing the cluster count, labels, label counts, and hierarchical tree arrays.
"""
def to2d(value):
return [[value]] if not isinstance(value, list) else value
def parse_matrix(value):
value = to2d(value)
if not isinstance(value, list) or not value or not all(isinstance(row, list) and row for row in value):
return None, "Error: data must be a non-empty 2D list"
if len({len(row) for row in value}) != 1:
return None, "Error: data must be a rectangular 2D list"
matrix = np.array(value, dtype=float)
if matrix.ndim != 2 or matrix.size == 0:
return None, "Error: data must be a non-empty 2D list"
if not np.isfinite(matrix).all():
return None, "Error: data must contain only finite numeric values"
return matrix, None
def as_column(values):
return [[{"type": "Double", "basicValue": float(item)}] for item in values]
def as_matrix(values):
return [[{"type": "Double", "basicValue": float(item)} for item in row] for row in values]
def label_count_table(labels):
unique_labels, counts = np.unique(labels, return_counts=True)
rows = [[{"type": "String", "basicValue": "label"}, {"type": "String", "basicValue": "count"}]]
rows.extend(
[[{"type": "Double", "basicValue": float(label)}, {"type": "Double", "basicValue": float(count)}]
for label, count in zip(unique_labels.tolist(), counts.tolist())]
)
return rows
try:
data_np, error = parse_matrix(data)
if error:
return error
cluster_total = int(n_clusters)
if cluster_total < 1:
return "Error: n_clusters must be at least 1"
if cluster_total > data_np.shape[0]:
return "Error: n_clusters cannot exceed the number of samples"
linkage_value = str(aggl_linkage).strip()
metric_value = str(aggl_metric).strip()
if linkage_value not in {"ward", "complete", "average", "single"}:
return "Error: linkage must be one of 'ward', 'complete', 'average', or 'single'"
if metric_value not in {"euclidean", "manhattan", "l1", "l2", "cosine"}:
return "Error: metric must be one of 'euclidean', 'manhattan', 'l1', 'l2', or 'cosine'"
if linkage_value == "ward" and metric_value != "euclidean":
return "Error: metric must be euclidean when linkage is ward"
fitted = SklearnAgglomerativeClustering(
n_clusters=cluster_total,
linkage=linkage_value,
metric=metric_value,
compute_distances=bool(compute_distances)
).fit(data_np)
labels = fitted.labels_
properties = {
"cluster_count": {"type": "Double", "basicValue": float(fitted.n_clusters_)},
"labels": {"type": "Array", "elements": as_column(labels.tolist())},
"label_counts": {"type": "Array", "elements": label_count_table(labels)},
"children": {"type": "Array", "elements": as_matrix(fitted.children_.tolist())},
"n_leaves": {"type": "Double", "basicValue": float(fitted.n_leaves_)},
"n_connected_components": {"type": "Double", "basicValue": float(fitted.n_connected_components_)}
}
if hasattr(fitted, "distances_"):
properties["distances"] = {"type": "Array", "elements": as_column(fitted.distances_.tolist())}
return {
"type": "Double",
"basicValue": float(fitted.n_clusters_),
"properties": properties
}
except Exception as e:
return f"Error: {str(e)}"