Improve embeddings comparison demo: better analogies, clustering, and visualizations

jeremymanning · jeremymanning · commit a6e0da8e8ca4 · 2026-01-16T08:21:07.000-05:00
- Wrap single words in context sentences for sentence embedding models
- Add explicit analogy candidate mappings for common patterns
- Upgrade k-means to 50 iterations with k-means++ initialization
- Add silhouette score for categorization quality metric
- Zoom radar chart and scatter plot axes to highlight model differences
- Add explanatory text to sidebar panels (leaderboard, performance, tradeoff)
diff --git a/demos/embeddings-comparison/css/embeddings-comparison.css b/demos/embeddings-comparison/css/embeddings-comparison.css
@@ -491,11 +491,20 @@
 }
 
 .panel h3 {
-    margin: 0 0 15px 0;
+    margin: 0 0 8px 0;
     font-size: 1.1em;
     color: var(--primary-color);
 }
 
+.panel-desc { /* short description text under a panel heading, ruled off from the content below */
+    font-size: 0.8em;
+    color: var(--text-secondary);
+    margin: 0 0 15px 0;
+    line-height: 1.5;
+    padding-bottom: 10px;
+    border-bottom: 1px solid var(--border-color);
+}
+
 /* Leaderboard */
 .leaderboard-item {
     display: flex;
diff --git a/demos/embeddings-comparison/index.html b/demos/embeddings-comparison/index.html
@@ -142,7 +142,7 @@ <h3>Semantic Similarity</h3>
                     <!-- Analogy Task -->
                     <div id="analogy-task" class="task-panel">
                         <h3>Word Analogies</h3>
-                        <p>Test reasoning: "A is to B as C is to ?"</p>
+                        <p>Test reasoning: "A is to B as C is to ?" Note: These sentence embedding models are optimized for sentences, not individual words. Classic analogies (like Word2Vec's king-man+woman=queen) may not work as reliably. Try different examples to see which relationships these models capture best.</p>
 
                         <div class="test-inputs">
                             <div class="input-group">
@@ -189,7 +189,7 @@ <h3>Word Analogies</h3>
                     <!-- Categorization Task -->
                     <div id="categorization-task" class="task-panel">
                         <h3>Topic Categorization</h3>
-                        <p>Test how well models group similar items using k-means clustering.</p>
+                        <p>Test how well models group similar items using k-means clustering. The algorithm uses k-means++ initialization for better results. Cluster quality is measured by silhouette score (higher = better separation).</p>
 
                         <div class="test-inputs">
                             <div class="input-group">
@@ -358,18 +358,21 @@ <h4>Similarity Computation</h4>
                 <!-- Leaderboard -->
                 <div class="panel leaderboard-panel">
                     <h3>Leaderboard</h3>
+                    <p class="panel-desc">Rankings based on the most recent test. Higher scores indicate better semantic understanding for the given task.</p>
                     <div id="leaderboard"></div>
                 </div>
 
                 <!-- Performance Radar Chart -->
                 <div class="panel radar-panel">
                     <h3>Performance Overview</h3>
+                    <p class="panel-desc">Compares models across three dimensions: Quality (similarity/accuracy), Speed (inference time), and Consistency. Axes zoom to highlight differences.</p>
                     <div id="radar-chart"></div>
                 </div>
 
                 <!-- Speed vs Quality -->
                 <div class="panel tradeoff-panel">
                     <h3>Speed vs Quality</h3>
+                    <p class="panel-desc">The fundamental tradeoff: larger models (upper-right) offer better quality but slower inference. Choose based on your latency requirements.</p>
                     <div id="tradeoff-chart"></div>
                 </div>
 
diff --git a/demos/embeddings-comparison/js/benchmark-tasks.js b/demos/embeddings-comparison/js/benchmark-tasks.js
@@ -41,11 +41,18 @@ export class BenchmarkTasks {
             candidates = this.generateAnalogyCandidates(wordA, wordB, wordC);
         }
 
+        // Sentence-based analogy approach: embed words in context sentences
+        // This works better for sentence embedding models than raw word vectors
+        const contextTemplate = (word) => `The word "${word}" represents a concept.`;
+        
         for (const modelId of modelIds) {
+            const startTime = performance.now();
+            
+            // Embed words in sentence context for better representations
             const [embA, embB, embC] = await Promise.all([
-                this.modelsManager.embed(modelId, wordA),
-                this.modelsManager.embed(modelId, wordB),
-                this.modelsManager.embed(modelId, wordC)
+                this.modelsManager.embed(modelId, contextTemplate(wordA)),
+                this.modelsManager.embed(modelId, contextTemplate(wordB)),
+                this.modelsManager.embed(modelId, contextTemplate(wordC))
             ]);
 
             // Calculate analogy vector: B - A + C
@@ -58,7 +65,7 @@ export class BenchmarkTasks {
             // Find best match among candidates
             const candidateResults = [];
             for (const candidate of candidates) {
-                const embD = await this.modelsManager.embed(modelId, candidate);
+                const embD = await this.modelsManager.embed(modelId, contextTemplate(candidate));
                 const similarity = this.modelsManager.cosineSimilarity(
                     analogyVec,
                     embD.embedding
@@ -67,14 +74,15 @@ export class BenchmarkTasks {
             }
 
             candidateResults.sort((a, b) => b.similarity - a.similarity);
+            const endTime = performance.now();
 
             results.push({
                 modelId,
                 modelName: this.modelsManager.getModelConfig(modelId).name,
                 prediction: candidateResults[0].word,
                 confidence: candidateResults[0].similarity,
                 allCandidates: candidateResults.slice(0, 5),
-                time: embA.time + embB.time + embC.time
+                time: endTime - startTime
             });
         }
 
@@ -95,40 +103,47 @@ export class BenchmarkTasks {
         const lower = (w) => w.toLowerCase();
         const a = lower(wordA), b = lower(wordB), c = lower(wordC);
         
-        const candidateSets = {
-            royalty: ['woman', 'girl', 'female', 'lady', 'princess', 'duchess', 'empress'],
-            capitals: ['England', 'UK', 'Britain', 'Germany', 'Spain', 'Italy', 'Japan', 'China', 'Canada', 'Australia'],
-            grammar: ['worse', 'worst', 'badly', 'poorly', 'terrible', 'awful'],
-            tense: ['ran', 'walked', 'jumped', 'swam', 'flew', 'drove', 'ate', 'slept'],
-            countries: ['French', 'German', 'Spanish', 'Italian', 'Japanese', 'Chinese', 'British', 'American'],
-            profession: ['actress', 'waitress', 'hostess', 'stewardess', 'heroine', 'woman'],
-            size: ['tiny', 'small', 'little', 'huge', 'giant', 'massive', 'enormous'],
-            emotion: ['sad', 'angry', 'scared', 'excited', 'nervous', 'calm', 'joyful']
+        const analogyMap = {
+            'king:queen:man': ['woman', 'lady', 'female', 'girl', 'wife', 'mother', 'queen', 'princess'],
+            'actor:actress:waiter': ['waitress', 'hostess', 'woman', 'female', 'lady', 'stewardess', 'maid'],
+            'hero:heroine:prince': ['princess', 'lady', 'queen', 'duchess', 'woman', 'girl', 'female'],
+            'paris:france:london': ['England', 'Britain', 'UK', 'United Kingdom', 'British', 'London', 'Europe'],
+            'tokyo:japan:berlin': ['Germany', 'German', 'Deutschland', 'Europe', 'Berlin', 'Austria'],
+            'rome:italy:madrid': ['Spain', 'Spanish', 'Espana', 'Europe', 'Portugal', 'Madrid'],
+            'good:better:bad': ['worse', 'worst', 'terrible', 'awful', 'poor', 'inferior', 'bad'],
+            'big:bigger:small': ['smaller', 'tinier', 'little', 'tiny', 'minor', 'lesser', 'small'],
+            'walk:walked:run': ['ran', 'running', 'runs', 'sprinted', 'jogged', 'run', 'raced'],
+            'dog:puppy:cat': ['kitten', 'kitty', 'baby cat', 'young cat', 'cub', 'feline', 'cat'],
+            'wood:tree:paper': ['pulp', 'plant', 'fiber', 'bamboo', 'reed', 'tree', 'forest'],
+            'hammer:nail:screwdriver': ['screw', 'bolt', 'fastener', 'nut', 'nail', 'pin', 'rivet']
         };
 
+        const key = `${a}:${b}:${c}`;
+        if (analogyMap[key]) {
+            return analogyMap[key];
+        }
+
         if ((a === 'king' && b === 'queen') || (a === 'man' && b === 'woman') || 
             (a === 'boy' && b === 'girl') || (a === 'father' && b === 'mother')) {
-            return candidateSets.royalty;
+            return ['woman', 'lady', 'female', 'girl', 'wife', 'mother', 'queen', 'princess', 'daughter'];
         }
         
-        if (['paris', 'london', 'berlin', 'tokyo', 'rome', 'madrid'].includes(a) ||
-            ['france', 'england', 'germany', 'japan', 'italy', 'spain'].includes(a)) {
-            return candidateSets.capitals;
+        if (['paris', 'london', 'berlin', 'tokyo', 'rome', 'madrid'].includes(a)) {
+            return ['England', 'Britain', 'UK', 'Germany', 'Spain', 'Italy', 'Japan', 'France', 'Europe'];
         }
         
-        if (['good', 'bad', 'big', 'small', 'fast', 'slow'].includes(a) &&
-            ['better', 'worse', 'bigger', 'smaller', 'faster', 'slower'].includes(b)) {
-            return candidateSets.grammar;
+        if (['good', 'bad', 'big', 'small', 'fast', 'slow'].includes(a)) {
+            return ['worse', 'smaller', 'slower', 'faster', 'bigger', 'better', 'terrible', 'great'];
         }
         
         if (['walk', 'run', 'swim', 'fly', 'drive', 'eat'].includes(a)) {
-            return candidateSets.tense;
+            return ['ran', 'walked', 'swam', 'flew', 'drove', 'ate', 'slept', 'jumped'];
         }
 
         return [
-            wordB, `${wordB}s`, `${wordC}er`, `${wordC}ing`,
-            'woman', 'man', 'person', 'thing', 'place',
-            'good', 'bad', 'big', 'small', 'new', 'old'
+            'woman', 'man', 'person', 'thing', 'place', 'time',
+            'good', 'bad', 'big', 'small', 'new', 'old',
+            wordB, wordC
         ];
     }
 
@@ -234,21 +249,13 @@ export class BenchmarkTasks {
         return validPoints > 0 ? totalScore / validPoints : 0;
     }
 
-    kMeansClusteringWithCentroids(embeddings, k, maxIters = 10) {
+    kMeansClusteringWithCentroids(embeddings, k, maxIters = 50) {
         const n = embeddings.length;
         const dim = embeddings[0].length;
 
-        const centroids = [];
-        const indices = new Set();
-        while (centroids.length < k) {
-            const idx = Math.floor(Math.random() * n);
-            if (!indices.has(idx)) {
-                centroids.push([...embeddings[idx]]);
-                indices.add(idx);
-            }
-        }
-
+        const centroids = this.kMeansPlusPlusInit(embeddings, k);
         let assignments = Array(n).fill(0);
+        let prevAssignments = null;
 
         for (let iter = 0; iter < maxIters; iter++) {
             for (let i = 0; i < n; i++) {
@@ -266,6 +273,11 @@ export class BenchmarkTasks {
                 assignments[i] = bestCluster;
             }
 
+            if (prevAssignments && assignments.every((a, i) => a === prevAssignments[i])) {
+                break;
+            }
+            prevAssignments = [...assignments];
+
             const clusterSums = Array.from({ length: k }, () => Array(dim).fill(0));
             const clusterCounts = Array(k).fill(0);
 
@@ -289,6 +301,42 @@ export class BenchmarkTasks {
         return { assignments, centroids };
     }
 
+    /**
+     * k-means++ seeding: the first centroid is drawn uniformly at random, each
+     * later one is sampled with probability proportional to its squared
+     * distance from the nearest centroid already chosen.
+     * @param {Array} embeddings - embedding vectors to seed from; must be non-empty
+     * @param {number} k - number of centroids to return
+     * @returns {Array} k centroids, each a copy of one of the input vectors
+     */
+    kMeansPlusPlusInit(embeddings, k) {
+        const n = embeddings.length;
+        const centroids = [[...embeddings[Math.floor(Math.random() * n)]]];
+
+        while (centroids.length < k) {
+            const distances = embeddings.map(emb => {
+                let minDist = Infinity;
+                for (const centroid of centroids) {
+                    const dist = this.euclideanDistance(emb, centroid);
+                    if (dist < minDist) minDist = dist;
+                }
+                return minDist * minDist;
+            });
+            const totalDist = distances.reduce((a, b) => a + b, 0);
+            let threshold = Math.random() * totalDist;
+            let pick = -1;
+            for (let i = 0; i < n && pick === -1; i++) {
+                threshold -= distances[i];
+                if (threshold <= 0) pick = i;
+            }
+            // Floating-point rounding can leave threshold > 0 after the last
+            // point; fall back to a uniform pick so every pass adds a centroid.
+            if (pick === -1) pick = Math.floor(Math.random() * n);
+            centroids.push([...embeddings[pick]]);
+        }
+        return centroids;
+    }
+
     euclideanDistance(vecA, vecB) {
         let sum = 0;
         for (let i = 0; i < vecA.length; i++) {
diff --git a/demos/embeddings-comparison/js/visualization.js b/demos/embeddings-comparison/js/visualization.js
@@ -122,32 +122,47 @@ export class Visualization {
         const theme = this.getPlotlyTheme();
         const colors = theme.colorway;
 
-        const traces = results.map((result, idx) => {
+        const allValues = [];
+        const traceData = results.map((result, idx) => {
             const quality = result.similarity || result.confidence || result.avgSimilarity || 0.5;
-            const speed = result.time ? Math.max(0, 1 - (result.time / 1000)) : 0.5;
+            const maxTime = Math.max(...results.map(r => r.time || 100));
+            const speed = result.time ? Math.max(0, 1 - (result.time / maxTime)) : 0.5;
             const consistency = quality;
-
-            return {
-                type: 'scatterpolar',
-                r: [quality, speed, consistency, quality],
-                theta: ['Quality', 'Speed', 'Consistency', 'Quality'],
-                fill: 'toself',
-                name: result.modelName,
-                opacity: 0.6,
-                line: { color: colors[idx % colors.length], width: 2 },
-                fillcolor: colors[idx % colors.length].replace(')', ', 0.3)').replace('rgb', 'rgba'),
-                marker: { color: colors[idx % colors.length] }
-            };
+            allValues.push(quality, speed, consistency);
+            return { quality, speed, consistency, result, idx };
         });
 
+        const minVal = Math.min(...allValues);
+        const maxVal = Math.max(...allValues);
+        const padding = (maxVal - minVal) * 0.15 || 0.1;
+        const rangeMin = Math.max(0, minVal - padding);
+        const rangeMax = Math.min(1, maxVal + padding);
+
+        const traces = traceData.map(({ quality, speed, consistency, result, idx }) => ({
+            type: 'scatterpolar',
+            r: [quality, speed, consistency, quality],
+            theta: ['Quality', 'Speed', 'Consistency', 'Quality'],
+            fill: 'toself',
+            name: result.modelName,
+            opacity: 0.6,
+            line: { color: colors[idx % colors.length], width: 2 },
+            fillcolor: colors[idx % colors.length].replace(')', ', 0.3)').replace('rgb', 'rgba'),
+            marker: { color: colors[idx % colors.length] }
+        }));
+
+        const tickCount = 4;
+        const tickStep = (rangeMax - rangeMin) / tickCount;
+        const tickvals = Array.from({ length: tickCount + 1 }, (_, i) => rangeMin + i * tickStep);
+        const ticktext = tickvals.map(v => v.toFixed(2));
+
         const layout = {
             polar: {
                 bgcolor: theme.polar.bgcolor,
                 radialaxis: {
                     visible: true,
-                    range: [0, 1],
-                    tickvals: [0.25, 0.5, 0.75, 1],
-                    ticktext: ['0.25', '0.50', '0.75', '1.00'],
+                    range: [rangeMin, rangeMax],
+                    tickvals: tickvals,
+                    ticktext: ticktext,
                     ...theme.polar.radialaxis
                 },
                 angularaxis: {
@@ -180,9 +195,16 @@ export class Visualization {
         const theme = this.getPlotlyTheme();
         const colors = theme.colorway;
 
+        const yValues = results.map(r => r.similarity || r.confidence || r.avgSimilarity || 0);
+        const minY = Math.min(...yValues);
+        const maxY = Math.max(...yValues);
+        const paddingY = (maxY - minY) * 0.2 || 0.05;
+        const yMin = Math.max(0, minY - paddingY);
+        const yMax = Math.min(1.05, maxY + paddingY);
+
         const trace = {
             x: results.map(r => r.time),
-            y: results.map(r => r.similarity || r.confidence || r.avgSimilarity || 0),
+            y: yValues,
             mode: 'markers+text',
             type: 'scatter',
             text: results.map(r => r.modelName.split('-')[0]),
@@ -208,7 +230,7 @@ export class Visualization {
             },
             yaxis: {
                 title: { text: 'Quality Score', font: theme.yaxis.titlefont },
-                range: [0, 1.1],
+                range: [yMin, yMax],
                 ...theme.yaxis,
                 automargin: true
             },