Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,7 @@
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
}
}
},
"python.testing.pytestArgs": ["packages/backend"],
"python.testing.pytestEnabled": true
}
29 changes: 29 additions & 0 deletions packages/backend/embedding_atlas/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,14 @@ def import_modules(names: list[str]):
"neighbors_column",
help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}. IDs should be zero-based row indices.',
)
@click.option(
"--pagerank",
"pagerank_column",
default=None,
is_flag=False,
flag_value="__compute__",
help="Compute PageRank scores from the neighbor graph, or specify a column containing pre-computed scores. Automatically computed when --image is specified.",
)
@click.option(
"--query",
default=None,
Expand Down Expand Up @@ -352,6 +360,7 @@ def main(
x_column: str | None,
y_column: str | None,
neighbors_column: str | None,
pagerank_column: str | None,
query: str | None,
sample: int | None,
umap_n_neighbors: int | None,
Expand Down Expand Up @@ -475,12 +484,32 @@ def main(
labels_df = load_pandas_data(labels)
labels_resolved = labels_df.to_dict("records")

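# Compute PageRank from the neighbor graph when explicitly requested (bare --pagerank,
# i.e. the "__compute__" sentinel) or when --image is given without a pagerank column.
# Either way, this only happens if the neighbors column is set and present in the data.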
should_compute_pagerank = (pagerank_column == "__compute__") or (
image is not None and pagerank_column is None
)
if (
should_compute_pagerank
and neighbors_column is not None
and neighbors_column in df.columns
):
from embedding_atlas.pagerank import compute_pagerank_column

logger.info("Computing PageRank scores from neighbor graph...")
pagerank_column = find_column_name(df.columns, "pagerank")
df[pagerank_column] = compute_pagerank_column(df, neighbors=neighbors_column)
elif pagerank_column == "__compute__":
logger.warning("Cannot compute PageRank: no neighbor data available.")
pagerank_column = None

props = make_embedding_atlas_props(
row_id=id_column,
x=x_column,
y=y_column,
neighbors=neighbors_column,
importance=pagerank_column,
text=text,
image=image,
point_size=point_size,
stop_words=stop_words_resolved,
labels=labels_resolved,
Expand Down
11 changes: 11 additions & 0 deletions packages/backend/embedding_atlas/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ class EmbeddingAtlasOptions(TypedDict, total=False):
text:
The column name for the textual data.

image:
The column name for image data.

importance:
The column name for importance scores (e.g., PageRank). Used with ``image`` to select
representative images for cluster labels. Maps to ``importance`` in the frontend API.

neighbors:
The column name containing precomputed K-nearest neighbors for each point.
Each value in the column should be a dictionary with the format:
Expand Down Expand Up @@ -60,6 +67,8 @@ class EmbeddingAtlasOptions(TypedDict, total=False):
x: str | None
y: str | None
text: str | None
image: str | None
importance: str | None
neighbors: str | None

point_size: float | None
Expand Down Expand Up @@ -109,6 +118,8 @@ def set_prop(key: str, value):
if options.get("x") is not None and options.get("y") is not None:
set_prop("data.projection", {"x": options.get("x"), "y": options.get("y")})
set_prop("data.text", options.get("text"))
set_prop("data.image", options.get("image"))
set_prop("data.importance", options.get("importance"))
set_prop("data.neighbors", options.get("neighbors"))

# Embedding View
Expand Down
Loading
Loading