Commit 4f6eb81

✨ Update llm ranking scripts to only update changed entries
1 parent: 46baac9

2 files changed: +46, -8 lines

scripts/rank_translations_llm.py (1 addition, 0 deletions)

```diff
@@ -24,6 +24,7 @@
 import time
 import argparse
 import re
+import shutil
 import pandas as pd
 from tqdm import tqdm
 from typing import List, Tuple, Dict
```
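Only the import appears in this hunk; presumably rank_translations_llm.py now snapshots the previous CSV before writing new rankings, so that update_llm_rank.py has an llm_ranks.csv.bak to diff against. A minimal sketch of that step, with the helper name and write order assumed rather than taken from the commit:

```python
import os
import shutil

import pandas as pd

def write_ranks_with_backup(df: pd.DataFrame,
                            path: str = "llm_ranks.csv",
                            backup_path: str = "llm_ranks.csv.bak") -> None:
    """Hypothetical helper: keep the last run's rankings for diffing."""
    if os.path.exists(path):
        shutil.copyfile(path, backup_path)  # snapshot the previous run
    df.to_csv(path, index=False)            # then write the new rankings
```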

scripts/update_llm_rank.py (45 additions, 8 deletions)

```diff
@@ -3,15 +3,19 @@
 Batch update Supabase translation.llm_rank using CSV produced by rank_translations_llm.py
 
 Inputs:
-  - llm_ranks.csv with columns: translation_id, llm_rank
+  - llm_ranks.csv with columns: translation_id, llm_rank (new rankings)
+  - llm_ranks.csv.bak (previous rankings for diffing)
 
 Environment:
   - For direct Postgres: use same .env as dump.py (user, password, host, port, dbname)
     or set DATABASE_URL explicitly (postgresql+psycopg2://...)
 
 Behavior:
-  - Performs efficient batched updates using SQLAlchemy executemany.
-  - On missing rows, skips silently.
+  - Diffs llm_ranks.csv against llm_ranks.csv.bak to find changed/new rows
+  - Only updates rows where llm_rank has changed (typically ~10 rows)
+  - Does not update updated_at timestamp
+  - If no backup file exists, updates all rows
+  - On missing rows in DB, skips silently
 
 Note:
   - New translations added after a ranking run should keep llm_rank NULL until next batch.
```
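The diffing described in the Behavior section is a single left-join on translation_id. Here is the same merge logic run on toy data to show which rows survive the filter (the sample IDs and ranks are invented for illustration):

```python
import pandas as pd

# Toy data: "b" changed rank, "c" is new, "a" is unchanged.
df_new = pd.DataFrame({"translation_id": ["a", "b", "c"], "llm_rank": [1, 5, 9]})
df_old = pd.DataFrame({"translation_id": ["a", "b"], "llm_rank": [1, 2]})

merged = df_new.merge(df_old, on="translation_id", how="left",
                      suffixes=("_new", "_old"))

# Keep rows that are new (no old rank) or whose rank changed.
df_changed = merged[
    merged["llm_rank_old"].isna() |
    (merged["llm_rank_new"] != merged["llm_rank_old"])
][["translation_id", "llm_rank_new"]].rename(columns={"llm_rank_new": "llm_rank"})

print(df_changed)  # -> rows b (2 -> 5) and c (new)
```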
```diff
@@ -46,26 +50,59 @@ def get_engine():
 def main():
     parser = argparse.ArgumentParser(description="Update translation.llm_rank from CSV")
     parser.add_argument("--ranks_csv", default="llm_ranks.csv")
+    parser.add_argument("--backup_csv", default="llm_ranks.csv.bak")
     parser.add_argument("--batch_size", type=int, default=1000)
     args = parser.parse_args()
 
-    df = pd.read_csv(args.ranks_csv)
-    if "translation_id" not in df.columns or "llm_rank" not in df.columns:
+    # Read new and backup CSVs
+    df_new = pd.read_csv(args.ranks_csv)
+    if "translation_id" not in df_new.columns or "llm_rank" not in df_new.columns:
         raise ValueError("llm_ranks.csv must have columns: translation_id, llm_rank")
 
+    # Read backup CSV (if it doesn't exist, update all rows)
+    if not os.path.exists(args.backup_csv):
+        print(f"Backup file {args.backup_csv} not found; updating all rows")
+        df_changed = df_new
+    else:
+        df_old = pd.read_csv(args.backup_csv)
+        if "translation_id" not in df_old.columns or "llm_rank" not in df_old.columns:
+            raise ValueError("Backup CSV must have columns: translation_id, llm_rank")
+
+        # Merge to find changes
+        merged = df_new.merge(
+            df_old,
+            on="translation_id",
+            how="left",
+            suffixes=("_new", "_old")
+        )
+
+        # Filter to only rows where llm_rank changed or is new
+        df_changed = merged[
+            (merged["llm_rank_old"].isna()) |
+            (merged["llm_rank_new"] != merged["llm_rank_old"])
+        ][["translation_id", "llm_rank_new"]].rename(columns={"llm_rank_new": "llm_rank"})
+
+    print(f"Found {len(df_changed)} changed/new rows out of {len(df_new)} total")
+
+    if len(df_changed) == 0:
+        print("No changes to update")
+        return
+
+    if len(df_changed) <= 200:
+        print("df_changed:", df_changed, sep="\n")
+
     engine = get_engine()
 
     sql = text("""
         update public.translation as t
-        set llm_rank = v.llm_rank,
-            updated_at = now()
+        set llm_rank = v.llm_rank
         from (values (:translation_id, :llm_rank)) as v(translation_id, llm_rank)
         where t.id = v.translation_id::uuid
     """)
 
     total = 0
     with engine.begin() as conn:
-        rows = df.to_dict("records")
+        rows = df_changed.to_dict("records")
         for i in range(0, len(rows), args.batch_size):
             batch = rows[i : i + args.batch_size]
             # executemany: param style is dict per row
```
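The hunk is cut off just before the statement is executed. With SQLAlchemy 1.4+, passing a list of parameter dicts to Connection.execute() runs the statement in executemany mode; a sketch of the loop body that likely follows (these lines are outside the diff context, so they are assumed, not shown by the commit):

```python
# Sketch of the rest of the batch loop (assumed, not part of the diff):
conn.execute(sql, batch)  # list of dicts -> executemany
total += len(batch)       # rowcount is unreliable under executemany
```

Dropping `updated_at = now()` from the SET clause matches the new docstring: a bulk re-ranking pass no longer makes every touched row look freshly edited.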
