Skip to content

Commit d32b969

Browse files
authored
feat: PageRank-based image summarizer (#192)
1 parent 5938e87 commit d32b969

23 files changed

Lines changed: 1024 additions & 60 deletions

.vscode/settings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,7 @@
2727
"editor.codeActionsOnSave": {
2828
"source.organizeImports": "explicit"
2929
}
30-
}
30+
},
31+
"python.testing.pytestArgs": ["packages/backend"],
32+
"python.testing.pytestEnabled": true
3133
}

packages/backend/embedding_atlas/cli.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,14 @@ def import_modules(names: list[str]):
227227
"neighbors_column",
228228
help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}. IDs should be zero-based row indices.',
229229
)
230+
@click.option(
231+
"--pagerank",
232+
"pagerank_column",
233+
default=None,
234+
is_flag=False,
235+
flag_value="__compute__",
236+
help="Compute PageRank scores from the neighbor graph, or specify a column containing pre-computed scores. Automatically computed when --image is specified.",
237+
)
230238
@click.option(
231239
"--query",
232240
default=None,
@@ -352,6 +360,7 @@ def main(
352360
x_column: str | None,
353361
y_column: str | None,
354362
neighbors_column: str | None,
363+
pagerank_column: str | None,
355364
query: str | None,
356365
sample: int | None,
357366
umap_n_neighbors: int | None,
@@ -475,12 +484,32 @@ def main(
475484
labels_df = load_pandas_data(labels)
476485
labels_resolved = labels_df.to_dict("records")
477486

487+
# Compute PageRank from the neighbor graph when --pagerank is passed without a column name, or when --image is specified and no pre-computed pagerank column was provided
488+
should_compute_pagerank = (pagerank_column == "__compute__") or (
489+
image is not None and pagerank_column is None
490+
)
491+
if (
492+
should_compute_pagerank
493+
and neighbors_column is not None
494+
and neighbors_column in df.columns
495+
):
496+
from embedding_atlas.pagerank import compute_pagerank_column
497+
498+
logger.info("Computing PageRank scores from neighbor graph...")
499+
pagerank_column = find_column_name(df.columns, "pagerank")
500+
df[pagerank_column] = compute_pagerank_column(df, neighbors=neighbors_column)
501+
elif pagerank_column == "__compute__":
502+
logger.warning("Cannot compute PageRank: no neighbor data available.")
503+
pagerank_column = None
504+
478505
props = make_embedding_atlas_props(
479506
row_id=id_column,
480507
x=x_column,
481508
y=y_column,
482509
neighbors=neighbors_column,
510+
importance=pagerank_column,
483511
text=text,
512+
image=image,
484513
point_size=point_size,
485514
stop_words=stop_words_resolved,
486515
labels=labels_resolved,

packages/backend/embedding_atlas/options.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@ class EmbeddingAtlasOptions(TypedDict, total=False):
1717
text:
1818
The column name for the textual data.
1919
20+
image:
21+
The column name for image data.
22+
23+
importance:
24+
The column name for importance scores (e.g., PageRank). Used with ``image`` to select
25+
representative images for cluster labels. Maps to ``importance`` in the frontend API.
26+
2027
neighbors:
2128
The column name containing precomputed K-nearest neighbors for each point.
2229
Each value in the column should be a dictionary with the format:
@@ -60,6 +67,8 @@ class EmbeddingAtlasOptions(TypedDict, total=False):
6067
x: str | None
6168
y: str | None
6269
text: str | None
70+
image: str | None
71+
importance: str | None
6372
neighbors: str | None
6473

6574
point_size: float | None
@@ -109,6 +118,8 @@ def set_prop(key: str, value):
109118
if options.get("x") is not None and options.get("y") is not None:
110119
set_prop("data.projection", {"x": options.get("x"), "y": options.get("y")})
111120
set_prop("data.text", options.get("text"))
121+
set_prop("data.image", options.get("image"))
122+
set_prop("data.importance", options.get("importance"))
112123
set_prop("data.neighbors", options.get("neighbors"))
113124

114125
# Embedding View

0 commit comments

Comments
 (0)