Skip to content

Commit 0ad8242

Browse files
patcon and claude committed
Fix PCA metrics layer in h5ad mode
The principal-components metric was falling back to loading the local demo projections.json (2D localmap coordinates) instead of actual PCA data. Now h5ad-loader extracts full-dimension embeddings (>2D) separately, and App uses them directly for PCA metrics. It prefers pca_masked_unscaled when available.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f776e7c commit 0ad8242

3 files changed

Lines changed: 78 additions & 25 deletions

File tree

src/components/convo-explorer/App.h5ad.stories.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ function H5adFileLoader() {
3434
statements: parsed.statements,
3535
votesRows: parsed.votesRows,
3636
pipelineData: parsed.allEmbeddings,
37+
fullDimensionEmbeddings: parsed.fullDimensionEmbeddings,
3738
});
3839
} catch (err) {
3940
console.error('Failed to load h5ad file:', err);

src/components/convo-explorer/App.tsx

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ export type PreloadedData = {
3636
statements: { statement_id: string; txt: string; moderated: number }[];
3737
votesRows: { participant_id: string; comment_id: string; vote: number }[];
3838
pipelineData?: Record<string, [string, [number, number]][]>;
39+
/** Full-dimension embeddings (>2D, e.g. PCA) for metrics layer */
40+
fullDimensionEmbeddings?: Record<string, [string, number[]][]>;
3941
};
4042

4143
type AppProps = {
@@ -405,33 +407,69 @@ export const App: React.FC<AppProps> = ({ testAnimation = false, kedroBaseUrl, i
405407

406408
setPointMetrics(newPointMetrics);
407409
} else if (metricConfig.type === "principal-components") {
408-
// Load principal component metrics (new logic)
410+
// Load principal component metrics
409411
const componentIndex = metricConfig.component - 1; // Convert 1-based to 0-based index
410412

411-
// Extract the imputer from the current pipeline ID and construct the PCA pipeline ID
412-
// e.g., "mean_localmap_bestkmeans" -> "mean_pca_bestkmeans"
413-
// e.g., "median_umap_bestkmeans" -> "median_pca_bestkmeans"
414-
const pipelineParts = currentPipelineId.split('_');
415-
let pcaPipelineId = 'mean_pca_bestkmeans'; // fallback default
413+
if (preloadedData?.fullDimensionEmbeddings) {
414+
// Preloaded mode: extract components from full-dimension embeddings
415+
const embKeys = Object.keys(preloadedData.fullDimensionEmbeddings);
416+
// Prefer pca_masked_unscaled, then any key containing 'pca', then first available
417+
const pcaKey = embKeys.find(k => k === 'pca_masked_unscaled')
418+
|| embKeys.find(k => k.includes('pca'))
419+
|| embKeys[0];
420+
421+
if (pcaKey) {
422+
const fullData = preloadedData.fullDimensionEmbeddings[pcaKey];
423+
console.log(`Using preloaded PCA embedding "${pcaKey}" (${fullData[0]?.[1]?.length || 0} dimensions)`);
424+
425+
// Build a map and normalize
426+
const rawValues = new Map<string, number>();
427+
let minValue = Infinity;
428+
let maxValue = -Infinity;
429+
430+
for (const [pid, coords] of fullData) {
431+
if (coords.length > componentIndex) {
432+
const value = coords[componentIndex];
433+
rawValues.set(pid, value);
434+
minValue = Math.min(minValue, value);
435+
maxValue = Math.max(maxValue, value);
436+
}
437+
}
438+
439+
const range = maxValue - minValue;
440+
const newPointMetrics = dataset.map(([participantId]) => {
441+
const raw = rawValues.get(participantId);
442+
if (raw === undefined) return null;
443+
return range > 0 ? (raw - minValue) / range : 0.5;
444+
});
445+
446+
console.log(`Calculated principal component ${metricConfig.component} for ${rawValues.size} participants (range: ${minValue.toFixed(3)} - ${maxValue.toFixed(3)})`);
447+
setPointMetrics(newPointMetrics);
448+
}
449+
} else {
450+
// Kedro/static mode: derive PCA pipeline ID from current pipeline
451+
const pipelineParts = currentPipelineId.split('_');
452+
let pcaPipelineId = 'mean_pca_bestkmeans'; // fallback default
453+
454+
if (pipelineParts.length >= 3) {
455+
const imputer = pipelineParts[0]; // e.g., "mean", "median"
456+
const clustering = "bestkmeans";
457+
pcaPipelineId = `${imputer}_pca_${clustering}`;
458+
}
416459

417-
if (pipelineParts.length >= 3) {
418-
const imputer = pipelineParts[0]; // e.g., "mean", "median"
419-
const clustering = "bestkmeans"; // alwasy assume "bestkmeans"
420-
pcaPipelineId = `${imputer}_pca_${clustering}`;
421-
}
460+
console.log(`Using PCA pipeline "${pcaPipelineId}" derived from current pipeline "${currentPipelineId}"`);
422461

423-
console.log(`Using PCA pipeline "${pcaPipelineId}" derived from current pipeline "${currentPipelineId}"`);
462+
const componentValues = await getPrincipalComponentValues(componentIndex, {
463+
kedroBaseUrl,
464+
pipelineId: pcaPipelineId
465+
});
424466

425-
const componentValues = await getPrincipalComponentValues(componentIndex, {
426-
kedroBaseUrl,
427-
pipelineId: pcaPipelineId
428-
});
467+
const newPointMetrics = dataset.map(([participantId]) => {
468+
return componentValues.get(participantId) ?? null;
469+
});
429470

430-
const newPointMetrics = dataset.map(([participantId]) => {
431-
return componentValues.get(participantId) ?? null;
432-
});
433-
434-
setPointMetrics(newPointMetrics);
471+
setPointMetrics(newPointMetrics);
472+
}
435473
}
436474
} catch (err) {
437475
console.error('Error loading metrics:', err);

src/lib/h5ad-loader.ts

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ export type H5adData = {
88
availableEmbeddings: string[];
99
/** All 2D embeddings keyed by pipeline-style ID (X_ prefix stripped) */
1010
allEmbeddings: Record<string, [string, [number, number]][]>;
11+
/** Full-dimension embeddings (>2D, e.g. PCA) keyed by pipeline-style ID */
12+
fullDimensionEmbeddings: Record<string, [string, number[]][]>;
1113
};
1214

1315
/**
@@ -155,6 +157,7 @@ export async function loadH5adFile(
155157
if (!obsmGroup) throw new Error('Missing /obsm group');
156158

157159
const allEmbeddings: Record<string, [string, [number, number]][]> = {};
160+
const fullDimensionEmbeddings: Record<string, [string, number[]][]> = {};
158161
for (const embKey of availableEmbeddings) {
159162
const ds = obsmGroup.get(embKey) as Dataset | null;
160163
if (!ds) continue;
@@ -174,16 +177,27 @@ export async function loadH5adFile(
174177
continue;
175178
}
176179

180+
// Strip X_ prefix for pipeline-style IDs
181+
const pipelineId = embKey.replace(/^X_/, '');
182+
183+
// Always store 2D projection (first 2 columns)
177184
const embDataset: [string, [number, number]][] = [];
178185
for (let i = 0; i < nObs; i++) {
179186
const x = flatCoords[i * nDims];
180187
const y = flatCoords[i * nDims + 1];
181188
embDataset.push([obsNames[i], [x, y]]);
182189
}
183-
184-
// Strip X_ prefix for pipeline-style IDs
185-
const pipelineId = embKey.replace(/^X_/, '');
186190
allEmbeddings[pipelineId] = embDataset;
191+
192+
// For high-dimensional embeddings (>2D), also store all dimensions
193+
if (nDims > 2) {
194+
const fullDataset: [string, number[]][] = [];
195+
for (let i = 0; i < nObs; i++) {
196+
const coords = flatCoords.slice(i * nDims, (i + 1) * nDims);
197+
fullDataset.push([obsNames[i], coords]);
198+
}
199+
fullDimensionEmbeddings[pipelineId] = fullDataset;
200+
}
187201
}
188202

189203
// Use the selected embedding as the default dataset
@@ -238,7 +252,7 @@ export async function loadH5adFile(
238252
// --- Read votes from uns/votes ---
239253
const votesRows = readVotes(file);
240254

241-
return { dataset, statements, votesRows, availableEmbeddings, allEmbeddings };
255+
return { dataset, statements, votesRows, availableEmbeddings, allEmbeddings, fullDimensionEmbeddings };
242256
} finally {
243257
if (file) {
244258
file.close();

0 commit comments

Comments
 (0)