Add LMCLUS model (#950)

ishii-norimi · web-flow · commit 3c7a7dd4d709 · 2025-04-13T14:59:31.000+09:00
diff --git a/README.md b/README.md
@@ -121,7 +121,7 @@ for (let i = 0; i < n; i++) {
 
 | task | model |
 | ---- | ----- |
-| clustering | (Soft / Kernel / Genetic / Weighted / Bisecting) k-means, k-means++, k-medois, k-medians, x-means, G-means, LBG, ISODATA, Fuzzy c-means, Possibilistic c-means, k-harmonic means, MacQueen, Hartigan-Wong, Elkan, Hamelry, Drake, Yinyang, Agglomerative (complete linkage, single linkage, group average, Ward's, centroid, weighted average, median), DIANA, Monothetic, Mutual kNN, Mean shift, DBSCAN, OPTICS, DTSCAN, HDBSCAN, DENCLUE, DBCLASD, BRIDGE, CLUES, PAM, CLARA, CLARANS, BIRCH, CURE, ROCK, C2P, PLSA, Latent dirichlet allocation, GMM, VBGMM, Affinity propagation, Spectral clustering, Mountain, (Growing) SOM, GTM, (Growing) Neural gas, Growing cell structures, LVQ, ART, SVC, CAST, CHAMELEON, COLL, CLIQUE, PROCLUS, ORCLUS, FINDIT, DOC, FastDOC, DiSH, NMF, Autoencoder |
+| clustering | (Soft / Kernel / Genetic / Weighted / Bisecting) k-means, k-means++, k-medoids, k-medians, x-means, G-means, LBG, ISODATA, Fuzzy c-means, Possibilistic c-means, k-harmonic means, MacQueen, Hartigan-Wong, Elkan, Hamerly, Drake, Yinyang, Agglomerative (complete linkage, single linkage, group average, Ward's, centroid, weighted average, median), DIANA, Monothetic, Mutual kNN, Mean shift, DBSCAN, OPTICS, DTSCAN, HDBSCAN, DENCLUE, DBCLASD, BRIDGE, CLUES, PAM, CLARA, CLARANS, BIRCH, CURE, ROCK, C2P, PLSA, Latent Dirichlet allocation, GMM, VBGMM, Affinity propagation, Spectral clustering, Mountain, (Growing) SOM, GTM, (Growing) Neural gas, Growing cell structures, LVQ, ART, SVC, CAST, CHAMELEON, COLL, CLIQUE, PROCLUS, ORCLUS, FINDIT, DOC, FastDOC, DiSH, LMCLUS, NMF, Autoencoder |
 | classification | (Fisher's) Linear discriminant, Quadratic discriminant, Mixture discriminant, Least squares, (Multiclass / Kernel) Ridge, (Complement / Negation / Universal-set / Selective) Naive Bayes (gaussian), AODE, (Fuzzy / Weighted) k-nearest neighbor, Radius neighbor, Nearest centroid, ENN, ENaN, NNBCA, ADAMENN, DANN, IKNN, Decision tree, Random forest, Extra trees, GBDT, XGBoost, ALMA, (Aggressive) ROMMA, (Bounded) Online gradient descent, (Budgeted online) Passive aggressive, RLS, (Selective-sampling) Second order perceptron, AROW, NAROW, Confidence weighted, CELLIP, IELLIP, Normal herd, Stoptron, (Kernelized) Pegasos, MIRA, Forgetron, Projectron, Projectron++, Banditron, Ballseptron, (Multiclass) BSGD, ILK, SILK, (Multinomial) Logistic regression, (Multinomial) Probit, SVM, Gaussian process, HMM, CRF, Bayesian Network, LVQ, (Average / Multiclass / Voted / Kernelized / Selective-sampling / Margin / Shifting / Budget / Tighter / Tightest) Perceptron, PAUM, RBP, ADALINE, MADALINE, MLP, ELM, LMNN |
 | semi-supervised classification | k-nearest neighbor, Radius neighbor, Label propagation, Label spreading, k-means, GMM, S3VM, Ladder network |
 | regression | Least squares, Ridge, Lasso, Elastic net, RLS, Bayesian linear, Poisson, Least absolute deviations, Huber, Tukey, Least trimmed squares, Least median squares, Lp norm linear, SMA, Deming, Segmented, LOWESS, LOESS, spline, Naive Bayes, Gaussian process, Principal components, Partial least squares, Projection pursuit, Quantile regression, k-nearest neighbor, Radius neighbor, IDW, Nadaraya Watson, Priestley Chao, Gasser Muller, RBF Network, RVM, Decision tree, Random forest, Extra trees, GBDT, XGBoost, SVR, MARS, MLP, ELM, GMR, Isotonic, Ramer Douglas Peucker, Theil-Sen, Passing-Bablok, Repeated median |
diff --git a/js/model_selector.js b/js/model_selector.js
@@ -153,6 +153,7 @@ const AIMethods = [
 				{ value: 'clues', title: 'CLUES' },
 				{ value: 'chameleon', title: 'CHAMELEON' },
 				{ value: 'coll', title: 'COLL' },
+				{ value: 'lmclus', title: 'LMCLUS' },
 				{ value: 'plsa', title: 'PLSA' },
 				{ value: 'latent_dirichlet_allocation', title: 'Latent Dirichlet Allocation' },
 				{ value: 'nmf', title: 'NMF' },
diff --git a/js/view/lmclus.js b/js/view/lmclus.js
@@ -0,0 +1,27 @@
+import LMCLUS from '../../lib/model/lmclus.js'
+import Controller from '../controller.js'
+
+export default function (platform) { // LMCLUS view: sets usage text and reference on the platform, then builds the parameter inputs and the Fit button
+	platform.setting.ml.usage = 'Click and add data point. Then, click "Fit" button.'
+	platform.setting.ml.reference = {
+		author: 'R. Haralick, R. Harpaz',
+		title: 'Linear manifold clustering in high dimensional spaces by stochastic search',
+		year: 2007,
+	}
+	const controller = new Controller(platform)
+
+	const fitModel = () => { // "Fit" click handler; k, s, gamma and clusters are declared further down and captured by closure
+		const model = new LMCLUS(k.value, s.value, gamma.value)
+
+		model.fit(platform.trainInput)
+		const pred = model.predict().map(v => v + 1) // shift the 0-based cluster indices returned by predict() up by one
+		platform.trainResult = pred
+		clusters.value = model.size // display the number of clusters found
+	}
+
+	const k = controller.input.number({ label: ' k ', min: 1, max: 1000, value: 2 }) // first LMCLUS constructor argument (max manifold dimension)
+	const s = controller.input.number({ label: ' s ', min: 1, max: 1000, value: 1.5, step: 0.1 }) // second constructor argument (sampling level)
+	const gamma = controller.input.number({ label: ' gamma ', min: 0, max: 1, value: 0.4, step: 0.01 }) // third constructor argument (sensitivity threshold)
+	controller.input.button('Fit').on('click', fitModel)
+	const clusters = controller.text({ label: ' Clusters: ' }) // output field written by fitModel
+}
diff --git a/lib/model/lmclus.js b/lib/model/lmclus.js
@@ -0,0 +1,228 @@
+import Matrix from '../util/matrix.js'
+
+/**
+ * Linear manifold clustering (LMCLUS): randomized clustering that assigns every training point to one cluster; results vary between runs because sampling uses Math.random().
+ */
+export default class LMCLUS {
+	// Linear manifold clustering in high dimensional spaces by stochastic search
+	// https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=76925699b88b9e3c599269f214b0b50fb02bb1f6
+	/**
+	 * @param {number} k Max LM dim; fit() tries manifold dimensions 1..k for every cluster it extracts
+	 * @param {number} s Sampling level; together with the dimension it determines the number of random trials in _findSeparation
+	 * @param {number} gamma Sensitivity threshold; a separation whose goodness is <= gamma is not applied
+	 */
+	constructor(k, s, gamma) {
+		this._k = k
+		this._s = s
+		this._gamma = gamma
+	}
+
+	/**
+	 * Number of clusters found by fit(). NOTE(review): this._c is only assigned inside fit(), so reading this before fit() throws.
+	 * @type {number}
+	 */
+	get size() {
+		return this._c.length
+	}
+
+	/**
+	 * Fit model: repeatedly extracts one cluster from the still-unlabeled points until every point is labeled.
+	 * @param {Array<Array<number>>} datas Training data (one numeric row per point)
+	 */
+	fit(datas) {
+		this._c = [] // per cluster: indices into datas of its member points
+		this._dims = [] // per cluster: the last manifold dimension whose separation was applied (1 if none)
+		const d = datas.concat() // shallow copy of the row list
+		const labeled = Array(d.length).fill(false)
+		while (labeled.some(v => !v)) { // each pass is one attempt to extract a cluster
+			let ddIndexes = []
+			for (let i = 0; i < labeled.length; i++) {
+				if (!labeled[i]) {
+					ddIndexes.push(i)
+				}
+			}
+			let dd = ddIndexes.map(i => d[i]) // candidate points: everything not yet assigned
+			let lmDim = 1
+			for (let k = 1; k <= this._k; k++) { // refine the candidate set with manifolds of increasing dimension
+				const [g, tau, phi, beta] = this._findSeparation(dd, k, this._s) // goodness, distance threshold, origin point, basis
+				if (g <= this._gamma) { // separation not good enough: keep the current candidate set as the cluster
+					break
+				}
+				const newdd = []
+				const newddidx = []
+				for (let j = 0; j < dd.length; j++) {
+					const xd = new Matrix(
+						dd[j].length,
+						1,
+						dd[j].map((v, i) => v - phi[i]) // point expressed relative to the origin phi
+					)
+					const bxd = beta.dot(xd)
+					const dist = xd.tDot(xd).toScaler() - bxd.tDot(bxd).toScaler() // squared norm of xd minus squared norm of beta * xd
+					if (dist < tau) { // keep only the points below the threshold
+						newdd.push(dd[j])
+						newddidx.push(ddIndexes[j])
+					}
+				}
+				dd = newdd // NOTE(review): dd may now hold fewer than k + 2 points while the next pass samples k + 2 of them — confirm _sampleidx copes
+				lmDim = k
+				ddIndexes = newddidx
+			}
+			if (dd.length === 0) { // filtering removed every candidate: retry with fresh random samples
+				continue // NOTE(review): nothing bounds the number of retries
+			}
+
+			this._c.push(ddIndexes)
+			this._dims.push(lmDim)
+			for (let i = 0; i < ddIndexes.length; i++) {
+				labeled[ddIndexes[i]] = true
+			}
+		}
+	}
+
+	_sampleidx(n, k) { // Draws k random indices below n; the second loop shifts later draws so the result is meant to contain no duplicates
+		const idx = []
+		for (let i = 0; i < k; i++) {
+			idx.push(Math.floor(Math.random() * (n - i))) // i-th draw ranges over n - i slots; NOTE(review): assumes k <= n, otherwise n - i is <= 0
+		}
+		for (let i = idx.length - 1; i >= 0; i--) {
+			for (let j = idx.length - 1; j > i; j--) {
+				if (idx[i] <= idx[j]) {
+					idx[j]++ // step over the slot already taken by draw i
+				}
+			}
+		}
+		return idx
+	}
+
+	_findSeparation(d, k, s) { // Random search over manifolds spanned by k + 1 sampled rows of d; returns [best goodness, threshold, origin row, basis]
+		let gamma = -Infinity // best goodness so far; stays -Infinity (with phi/beta null) when no trial improves on it
+		let tau = -Infinity
+		let phi = null
+		let beta = null
+		const eps = 1.0e-8
+		const c = 1
+		const N = Math.min(Math.log(eps) / Math.log(1 - (1 / s) ** k), c * d.length) // number of trials, capped at c * d.length
+
+		for (let i = 0; i < N; i++) {
+			const idxes = this._sampleidx(d.length, k + 1)
+			const m = idxes.map(idx => d[idx])
+			const [b] = Matrix.fromArray(m).qrGramSchmidt() // first factor returned by the decomposition of the sampled rows serves as the basis — TODO confirm its shape matches b.dot(xd)
+			const distances = []
+			for (let j = 0; j < d.length; j++) {
+				if (idxes.includes(j)) { // the sampled rows themselves are left out of the histogram
+					continue
+				}
+				const xd = new Matrix(d[j].length, 1, d[j]) // NOTE(review): unlike fit(), the origin m[0] is not subtracted here — confirm this is intended
+				const bxd = b.dot(xd)
+				const dist = xd.tDot(xd).toScaler() - bxd.tDot(bxd).toScaler()
+				distances.push(dist)
+			}
+
+			const [hist, ranges] = this._makeHistogram(distances)
+			const [t, g] = this._findMinimumErrorThreshold(hist, ranges)
+			if (g > gamma) { // remember the trial with the highest goodness
+				gamma = g
+				tau = t
+				phi = m[0] // first sampled row is used as the manifold origin
+				beta = b
+			}
+		}
+		return [gamma, tau, phi, beta]
+	}
+
+	_makeHistogram(d) { // Bins the numbers in d into equal-width bins starting at min(d); returns [bin counts, bin edges]
+		let max = -Infinity
+		let min = Infinity
+		let sum = 0
+		for (let i = 0; i < d.length; i++) {
+			max = Math.max(max, d[i])
+			min = Math.min(min, d[i])
+			sum += d[i]
+		}
+		const mean = sum / d.length
+		let vari = 0
+		for (let i = 0; i < d.length; i++) {
+			vari += (mean - d[i]) ** 2
+		}
+		vari /= d.length
+		const std = Math.sqrt(vari)
+		const step = std * Math.cbrt((24 * Math.sqrt(Math.PI)) / d.length) // bin width derived from the standard deviation and the sample count
+		const ranges = [min]
+		while (ranges[ranges.length - 1] < max) { // append edges until the last one reaches max
+			ranges[ranges.length] = ranges[ranges.length - 1] + step
+		}
+		const count = ranges.length - 1
+
+		const hist = Array(count).fill(0)
+		for (let i = 0; i < d.length; i++) {
+			if (d[i] === max) { // the maximum is counted in the last bin
+				hist[count - 1]++
+			} else {
+				hist[Math.floor((d[i] - min) / step)]++
+			}
+		}
+		return [hist, ranges]
+	}
+
+	_findMinimumErrorThreshold(h, r) { // Tries every split position of histogram h; returns [edge from r above the chosen bin, goodness of that split]
+		let tau = -1
+		let minj = Infinity
+		let maxj = -Infinity
+		let discriminability = 0
+		for (let t = 0; t < h.length - 1; t++) { // bins 0..t form group 1, bins t+1.. form group 2
+			let p1 = 0
+			let p2 = 0
+			let m1 = 0
+			let m2 = 0
+			for (let i = 0; i < h.length; i++) {
+				if (i <= t) {
+					p1 += h[i]
+					m1 += i * h[i]
+				} else {
+					p2 += h[i]
+					m2 += i * h[i]
+				}
+			}
+			m1 /= p1 // mean bin index of group 1; NOTE(review): NaN when the group holds no samples
+			m2 /= p2
+			let s1 = 0
+			let s2 = 0
+			for (let i = 0; i < h.length; i++) {
+				if (i <= t) {
+					s1 += (i - m1) ** 2 * h[i]
+				} else {
+					s2 += (i - m2) ** 2 * h[i]
+				}
+			}
+			s1 /= p1 // variance of the bin index within group 1
+			s2 /= p2
+
+			const j = // criterion value of this split; presumably the minimum-error-thresholding criterion with raw counts for p1/p2 — TODO confirm
+				1 +
+				2 * (p1 * Math.log(Math.sqrt(s1)) + p2 * Math.log(Math.sqrt(s2))) -
+				2 * (p1 * Math.log(p1) + p2 * Math.log(p2))
+			if (j < minj) { // the split with the smallest criterion value wins
+				minj = j
+				tau = t
+				discriminability = (m1 - m2) ** 2 / (s1 + s2)
+			}
+			maxj = Math.max(maxj, j)
+		}
+		const g = discriminability * (maxj - minj) // goodness = discriminability of the best split times the spread of the criterion
+		return [r[tau + 1], g] // NOTE(review): tau is still -1 when h has fewer than two bins, so r[0] is returned
+	}
+
+	/**
+	 * Returns predicted categories: for every training point stored by fit(), the 0-based index of the cluster that contains it.
+	 * @returns {number[]} Predicted values
+	 */
+	predict() {
+		const pred = []
+		for (let k = 0; k < this._c.length; k++) {
+			for (let i = 0; i < this._c[k].length; i++) {
+				pred[this._c[k][i]] = k // this._c[k][i] is an index into the training data
+			}
+		}
+		return pred
+	}
+}
diff --git a/tests/gui/view/lmclus.test.js b/tests/gui/view/lmclus.test.js
@@ -0,0 +1,42 @@
+import { getPage } from '../helper/browser'
+
+describe('clustering', () => { // GUI tests for the LMCLUS view: default input values and a Fit run
+	/** @type {Awaited<ReturnType<getPage>>} */
+	let page
+	beforeEach(async () => {
+		page = await getPage()
+		const taskSelectBox = await page.waitForSelector('#ml_selector dl:first-child dd:nth-child(5) select')
+		await taskSelectBox.selectOption('CT') // presumably the clustering task entry — verify against the task list
+		const modelSelectBox = await page.waitForSelector('#ml_selector .model_selection #mlDisp')
+		await modelSelectBox.selectOption('lmclus')
+	})
+
+	afterEach(async () => {
+		await page?.close() // page is undefined when getPage() failed in beforeEach
+	})
+
+	test('initialize', async () => {
+		const methodMenu = await page.waitForSelector('#ml_selector #method_menu')
+		const buttons = await methodMenu.waitForSelector('.buttons')
+
+		const k = await buttons.waitForSelector('input:nth-of-type(1)') // inputs are addressed by position, so their creation order matters
+		await expect(k.getAttribute('value')).resolves.toBe('2')
+		const s = await buttons.waitForSelector('input:nth-of-type(2)')
+		await expect(s.getAttribute('value')).resolves.toBe('1.5')
+		const gamma = await buttons.waitForSelector('input:nth-of-type(3)')
+		await expect(gamma.getAttribute('value')).resolves.toBe('0.4')
+	})
+
+	test('learn', async () => {
+		const methodMenu = await page.waitForSelector('#ml_selector #method_menu')
+		const buttons = await methodMenu.waitForSelector('.buttons')
+
+		const clusters = await buttons.waitForSelector('span:last-child', { state: 'attached' }) // only needs to be attached; it has no text yet
+		await expect(clusters.textContent()).resolves.toBe('')
+
+		const fitButton = await buttons.waitForSelector('input[value=Fit]')
+		await fitButton.evaluate(el => el.click()) // click issued from inside the page
+
+		await expect(clusters.textContent()).resolves.toMatch(/^[0-9]+$/) // after fitting, the field holds digits only
+	})
+})
diff --git a/tests/lib/model/lmclus.test.js b/tests/lib/model/lmclus.test.js
@@ -0,0 +1,27 @@
+import { jest } from '@jest/globals'
+jest.retryTimes(5) // allow retries; presumably because both the generated data and the model's sampling are random
+
+import Matrix from '../../../lib/util/matrix.js'
+import LMCLUS from '../../../lib/model/lmclus.js'
+
+import { randIndex } from '../../../lib/evaluate/clustering.js'
+
+test('clustering', () => { // fits LMCLUS on three generated groups of n points and requires a Rand index above 0.8 against the generating groups
+	const model = new LMCLUS(5, 1.5, 0.4)
+	const n = 50
+	const x = Matrix.concat(
+		Matrix.concat(Matrix.randn(n, 5, 0, 0.1), Matrix.randn(n, 5, 5, 0.1)), // two n x 5 random blocks; third argument presumably the mean — confirm against Matrix.randn
+		Matrix.randn(n, 5, [0, 5, 0, 5, 0], 0.1)
+	).toArray()
+
+	model.fit(x)
+	const y = model.predict()
+	expect(y).toHaveLength(x.length) // one label per input row
+
+	const t = []
+	for (let i = 0; i < x.length; i++) {
+		t[i] = Math.floor(i / n) // expected label: the blocks were concatenated one after another, n rows each
+	}
+	const ri = randIndex(y, t)
+	expect(ri).toBeGreaterThan(0.8)
+})