Correctness, simplicity, and test coverage improvements

cmdcolin · claude · cmdcolin · commit aa6feff68e5b · 2026-04-27T15:24:42.000-04:00
- Fix clustersGivenK to have N elements (was N+1 with a trailing empty array)
- Avoid intermediate array allocations in clustersGivenK building; mutate
  membership arrays in place
- Remove {} as ClusterNode typecasts in fromNewick via newNode() helper,
  eliminating the fillDefaults post-pass entirely
- Simplify treeToJSON to return ClusterNode directly
- Add explicit case ';' in fromNewick switch
- Add integration tests: K=3 partition, order permutation, progress callbacks,
  equal-distance determinism, clusterObject label propagation
- Fix README Algorithm section (was describing old O(n³) pure-JS version;
  current C code uses Lance-Williams recurrence, same as R hclust)
- Add UPGMA and Lance-Williams citations to distance.c and README

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.prettierignore b/.prettierignore
@@ -1 +1,2 @@
 pnpm-lock.yaml
+src/distance.js
diff --git a/README.md b/README.md
@@ -6,22 +6,26 @@ WebAssembly, with JavaScript/TypeScript wrappers for easy integration.
 ## Algorithm
 
 **Agglomerative hierarchical clustering with average linkage (UPGMA).** Each
-sample starts as its own cluster; at each step the two clusters with the
+sample starts as its own cluster. At each step the two clusters with the
 smallest mean pairwise Euclidean distance are merged, until one cluster remains.
-
-Average linkage measures inter-cluster distance as the mean of all pairwise
-distances. This is a middle ground between single linkage (minimum distance,
-prone to chaining) and complete linkage (maximum distance, forces compact
-clusters). For the genomics track use case — ordering samples by similarity for
-a heatmap — average linkage is a good default. Note that R's `hclust` defaults
-to `method="complete"`; use `method="average"` to get equivalent behavior.
-
-This is equivalent to R's `hclust(method="average")`, with two differences: R
-uses the Lance-Williams recurrence for an O(n²) merge step, whereas this
-recomputes average distances from the original matrix each iteration (O(n³)).
-For the tens-to-hundreds of samples typical in genomics tracks, this is
-negligible and WASM more than compensates. R also accepts a precomputed distance
-matrix; this library computes Euclidean distances from raw vectors internally.
+The result is a binary tree (dendrogram) whose internal node heights record the
+distance at which each merge occurred.
+
+Average linkage (UPGMA) measures inter-cluster distance as the mean of all
+pairwise distances between members of the two clusters. It is a middle ground
+between single linkage (minimum distance, prone to chaining) and complete
+linkage (maximum distance, forces compact clusters). For the genomics use case —
+ordering samples by similarity for a heatmap — average linkage is a reliable
+default. Note that R's `hclust` defaults to `method="complete"`; use
+`method="average"` to match this library's output.
+
+This produces equivalent results to R's `hclust(method="average")`. The
+implementation uses the Lance-Williams recurrence (Lance & Williams 1967) to
+update inter-cluster distances in O(n) per merge step, giving O(n²) total for
+clustering after the O(n²) initial distance matrix computation. The two
+differences from R are that this library computes Euclidean distances from raw
+vectors internally and uses Float32 precision, whereas R accepts a precomputed
+distance matrix and uses double precision.
 
 ## Features
 
@@ -149,6 +153,18 @@ clusterData({
 | SharedArrayBuffer + Atomics | yes                  | yes                 | yes                             |
 | Blob URL + sync XHR         | no                   | yes                 | no                              |
 
+## References
+
+- **UPGMA**: Sokal, R.R. & Michener, C.D. (1958). "A statistical method for
+  evaluating systematic relationships." _University of Kansas Science Bulletin_,
+  38, 1409–1438.
+- **Lance-Williams recurrence**: Lance, G.N. & Williams, W.T. (1967). "A general
+  theory of classificatory sorting strategies. 1. Hierarchical systems."
+  _Computer Journal_, 9(4), 373–380.
+- **Newick format**: Olsen, G.J. (1990). "Interpretation of the 'Newick's 8:45'
+  tree format standard."
+  http://evolution.genetics.washington.edu/phylip/newicktree.html
+
 ## Note
 
 Generated with the help of Claude Code AI, you might be able to tell from the
diff --git a/src/cluster.ts b/src/cluster.ts
@@ -23,25 +23,38 @@ export async function clusterData({
 
   // Build clustersGivenK from stable-slot merge sequence.
   // mergeA[i] and mergeB[i] are stable slot indices; slot mergeA[i] absorbs mergeB[i].
+  // clustersGivenK[k] = cluster partitions when there are k+1 clusters (k=0..N-1).
   const numSamples = data.length
-  const clustersGivenK: number[][][] = [[]]
+  const clustersGivenK: number[][][] = []
 
-  const membership = Array.from(
-    { length: numSamples },
-    (_, i) => [i] as number[],
-  )
-  const activeSlots = new Set(Array.from({ length: numSamples }, (_, i) => i))
+  const membership: number[][] = Array.from({ length: numSamples }, (_, i) => [
+    i,
+  ])
+  const activeSlots = new Set<number>()
+  for (let i = 0; i < numSamples; i++) {
+    activeSlots.add(i)
+  }
 
   for (let i = 0; i < numSamples - 1; i++) {
     const [a, b] = result.merges[i]!
 
-    clustersGivenK.push([...activeSlots].map(id => [...membership[id]!]))
+    const snapshot: number[][] = []
+    for (const id of activeSlots) {
+      snapshot.push([...membership[id]!])
+    }
+    clustersGivenK.push(snapshot)
 
-    membership[a] = [...membership[a]!, ...membership[b]!]
+    for (const m of membership[b]!) {
+      membership[a]!.push(m)
+    }
     activeSlots.delete(b)
   }
 
-  clustersGivenK.push([...activeSlots].map(id => [...membership[id]!]))
+  const finalSnapshot: number[][] = []
+  for (const id of activeSlots) {
+    finalSnapshot.push([...membership[id]!])
+  }
+  clustersGivenK.push(finalSnapshot)
 
   return {
     tree: result.tree,
diff --git a/src/distance.js b/src/distance.js
diff --git a/src/tree-utils.ts b/src/tree-utils.ts
@@ -19,6 +19,9 @@ export function printTree(
   return output
 }
 
+// Newick format: Olsen (1990) http://evolution.genetics.washington.edu/phylip/newicktree.html
+// Note: this library encodes internal node height as the label (e.g. "(A,B)1.2345"),
+// not as a branch length (":"). fromNewick handles both forms on input.
 export function toNewick(node: ClusterNode): string {
   if (!node.children || node.children.length === 0) {
     return node.name
@@ -28,14 +31,17 @@ export function toNewick(node: ClusterNode): string {
   return `(${childStrings.join(',')})${node.height.toFixed(4)}`
 }
 
+function newNode(): ClusterNode {
+  return { name: '', height: 0 }
+}
+
 export function fromNewick(s: string): ClusterNode {
   const ancestors: ClusterNode[] = []
-
-  let tree = {} as ClusterNode
+  let tree = newNode()
   const tokens = s.split(/\s*(;|\(|\)|,|:)\s*/)
   for (let i = 0; i < tokens.length; i++) {
     const token = tokens[i]!
-    const subtree = {} as ClusterNode
+    const subtree = newNode()
     switch (token) {
       case '(':
         tree.children = [subtree]
@@ -49,6 +55,7 @@ export function fromNewick(s: string): ClusterNode {
       case ')':
         tree = ancestors.pop()!
         break
+      case ';':
       case ':':
         break
       default: {
@@ -69,38 +76,16 @@ export function fromNewick(s: string): ClusterNode {
     }
   }
 
-  function fillDefaults(node: ClusterNode) {
-    if (!node.name) {
-      node.name = ''
-    }
-    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
-    if (node.height === undefined) {
-      node.height = 0
-    }
-    if (node.children) {
-      for (const child of node.children) {
-        fillDefaults(child)
-      }
-    }
-  }
-
-  fillDefaults(tree)
   return tree
 }
 
-export function treeToJSON(node: ClusterNode) {
-  const result: {
-    name: string
-    height: number
-    children?: ReturnType<typeof treeToJSON>[]
-  } = {
+export function treeToJSON(node: ClusterNode): ClusterNode {
+  if (!node.children?.length) {
+    return { name: node.name, height: node.height }
+  }
+  return {
     name: node.name,
     height: node.height,
+    children: node.children.map(treeToJSON),
   }
-
-  if (node.children && node.children.length > 0) {
-    result.children = node.children.map(child => treeToJSON(child))
-  }
-
-  return result
 }
diff --git a/src/wasm/distance.c b/src/wasm/distance.c
@@ -2,6 +2,14 @@
  * High-performance hierarchical clustering (UPGMA / average-linkage)
  * Compiled to WebAssembly using Emscripten
  *
+ * Algorithm: UPGMA (Unweighted Pair Group Method with Arithmetic Mean)
+ *   Sokal & Michener (1958). "A statistical method for evaluating systematic
+ *   relationships." University of Kansas Science Bulletin, 38, 1409-1438.
+ *
+ * Distance update: Lance-Williams recurrence for average linkage
+ *   Lance & Williams (1967). "A general theory of classificatory sorting
+ *   strategies. 1. Hierarchical systems." Computer Journal, 9(4), 373-380.
+ *
  * Key design:
  *  - Stable slot IDs: slot mergeA[i] absorbs mergeB[i]; mergeA[i] < mergeB[i] always.
  *    Slot 0 is always the final root.
diff --git a/test/cluster.test.ts b/test/cluster.test.ts
@@ -145,10 +145,9 @@ describe('clusterData', () => {
 
     const result = await clusterData({ data })
 
-    expect(result.clustersGivenK).toHaveLength(3)
+    expect(result.clustersGivenK).toHaveLength(2)
     expect(result.clustersGivenK[0]).toEqual([[0, 1]])
     expect(result.clustersGivenK[1]).toEqual([[0], [1]])
-    expect(result.clustersGivenK[2]).toEqual([])
   })
 
   it('should build clustersGivenK correctly for 3 samples', async () => {
@@ -186,14 +185,13 @@ describe('clusterData', () => {
 
     const result = await clusterData({ data })
 
-    expect(result.clustersGivenK).toHaveLength(4)
+    expect(result.clustersGivenK).toHaveLength(3)
     expect(result.clustersGivenK[0]?.length).toBe(1)
     expect(result.clustersGivenK[0]?.[0]).toContain(0)
     expect(result.clustersGivenK[0]?.[0]).toContain(1)
     expect(result.clustersGivenK[0]?.[0]).toContain(2)
     expect(result.clustersGivenK[1]).toHaveLength(2)
     expect(result.clustersGivenK[2]).toHaveLength(3)
-    expect(result.clustersGivenK[3]).toEqual([])
   })
 
   it('should handle single sample case', async () => {
@@ -212,9 +210,8 @@ describe('clusterData', () => {
 
     expect(result.tree).toEqual({ name: 'Sample 0', height: 0 })
     expect(result.order).toEqual([0])
-    expect(result.clustersGivenK).toHaveLength(2)
+    expect(result.clustersGivenK).toHaveLength(1)
     expect(result.clustersGivenK[0]).toEqual([[0]])
-    expect(result.clustersGivenK[1]).toEqual([])
   })
 
   it('should handle complex merge sequences', async () => {
@@ -243,9 +240,11 @@ describe('clusterData', () => {
 
     const result = await clusterData({ data })
 
-    expect(result.clustersGivenK.length).toBe(5)
+    expect(result.clustersGivenK.length).toBe(4)
     expect(result.clustersGivenK[0]?.length).toBe(1)
-    expect(result.clustersGivenK[result.clustersGivenK.length - 1]).toEqual([])
+    expect(
+      result.clustersGivenK[result.clustersGivenK.length - 1],
+    ).toHaveLength(4)
   })
 
   it('should propagate error thrown by checkCancellation', async () => {
diff --git a/test/integration.test.ts b/test/integration.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest'
 
-import { clusterData } from '../src/cluster.js'
+import { clusterData, clusterObject } from '../src/cluster.js'
 
 function sortedClusters(clusters: number[][]) {
   return clusters
@@ -18,10 +18,9 @@ describe('clusterData integration', () => {
 
     expect(result.order).toEqual([0, 1])
 
-    expect(result.clustersGivenK).toHaveLength(3)
+    expect(result.clustersGivenK).toHaveLength(2)
     expect(sortedClusters(result.clustersGivenK[0]!)).toEqual([[0, 1]])
     expect(sortedClusters(result.clustersGivenK[1]!)).toEqual([[0], [1]])
-    expect(result.clustersGivenK[2]).toEqual([])
   })
 
   it('clusters 4 samples into correct groups', async () => {
@@ -42,8 +41,7 @@ describe('clusterData integration', () => {
 
     expect(result.order).toEqual([0, 1, 2, 3])
 
-    expect(result.clustersGivenK).toHaveLength(5)
-    expect(result.clustersGivenK[4]).toEqual([])
+    expect(result.clustersGivenK).toHaveLength(4)
     expect(sortedClusters(result.clustersGivenK[0]!)).toEqual([[0, 1, 2, 3]])
     expect(sortedClusters(result.clustersGivenK[1]!)).toEqual([
       [0, 1],
@@ -80,6 +78,68 @@ describe('clusterData integration', () => {
     )
   })
 
+  it('includes K=3 partition in clustersGivenK for 4 samples', async () => {
+    // After first merge {0,1}, before second merge {2,3}, K=3 = {0,1}, {2}, {3}
+    const data = [[1], [2], [5], [7]]
+    const result = await clusterData({ data })
+
+    expect(sortedClusters(result.clustersGivenK[2]!)).toEqual([
+      [0, 1],
+      [2],
+      [3],
+    ])
+  })
+
+  it('order is a valid permutation of sample indices', async () => {
+    const data = [
+      [1, 2],
+      [3, 4],
+      [5, 1],
+      [2, 8],
+    ]
+    const result = await clusterData({ data })
+
+    expect([...result.order].sort((a, b) => a - b)).toEqual([0, 1, 2, 3])
+  })
+
+  it('fires progress callbacks during real clustering', async () => {
+    const data = Array.from({ length: 10 }, (_, i) => [i])
+    const messages: string[] = []
+
+    await clusterData({ data, onProgress: msg => messages.push(msg) })
+
+    expect(messages.length).toBeGreaterThan(0)
+    expect(messages[0]).toBe('Running hierarchical clustering in WASM...')
+  })
+
+  it('handles equal distances deterministically', async () => {
+    // Sample 1 and 2 are both distance 1 from sample 0 — ties should resolve consistently
+    const data = [
+      [0, 0],
+      [1, 0],
+      [0, 1],
+    ]
+    const result1 = await clusterData({ data })
+    const result2 = await clusterData({ data })
+
+    expect(result1.order).toEqual(result2.order)
+    expect(result1.clustersGivenK).toEqual(result2.clustersGivenK)
+  })
+
+  it('clusterObject propagates labels to leaf nodes', async () => {
+    const result = await clusterObject({
+      data: { alpha: [1, 2], beta: [1, 3], gamma: [9, 9] },
+    })
+
+    const leafNames = (node: {
+      name: string
+      children?: (typeof node)[]
+    }): string[] =>
+      node.children ? node.children.flatMap(leafNames) : [node.name]
+
+    expect(leafNames(result.tree).sort()).toEqual(['alpha', 'beta', 'gamma'])
+  })
+
   it('returns deterministic results for the same input', async () => {
     const data = [
       [1, 2],