|
 Batch update Supabase translation.llm_rank using the CSV produced by rank_translations_llm.py
 
 Inputs:
-- llm_ranks.csv with columns: translation_id, llm_rank
+- llm_ranks.csv with columns: translation_id, llm_rank (new rankings)
+- llm_ranks.csv.bak (previous rankings, used for diffing)
 
 Environment:
 - For direct Postgres: use the same .env as dump.py (user, password, host, port, dbname),
   or set DATABASE_URL explicitly (postgresql+psycopg2://...)
 
 Behavior:
-- Performs efficient batched updates using SQLAlchemy executemany.
-- On missing rows, skips silently.
+- Diffs llm_ranks.csv against llm_ranks.csv.bak to find changed/new rows
+- Only updates rows where llm_rank has changed (typically ~10 rows)
+- Does not update the updated_at timestamp
+- If no backup file exists, updates all rows
+- Silently skips rows whose ids are missing from the DB
 
 Note:
 - New translations added after a ranking run should keep llm_rank NULL until the next batch.
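For context, the diff assumes llm_ranks.csv.bak already holds the previous run's rankings. A minimal sketch of the rotation step that would produce it, assuming it belongs in rank_translations_llm.py (the copy step itself is not shown in this PR):

```python
import os
import shutil

# Hypothetical pre-write step in rank_translations_llm.py: keep the previous
# rankings as llm_ranks.csv.bak so this script has something to diff against.
if os.path.exists("llm_ranks.csv"):
    shutil.copy2("llm_ranks.csv", "llm_ranks.csv.bak")
# ...then write the fresh rankings to llm_ranks.csv...
```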
@@ -46,26 +50,59 @@ def get_engine(): |
 def main():
     parser = argparse.ArgumentParser(description="Update translation.llm_rank from CSV")
     parser.add_argument("--ranks_csv", default="llm_ranks.csv")
+    parser.add_argument("--backup_csv", default="llm_ranks.csv.bak")
     parser.add_argument("--batch_size", type=int, default=1000)
     args = parser.parse_args()
 
-    df = pd.read_csv(args.ranks_csv)
-    if "translation_id" not in df.columns or "llm_rank" not in df.columns:
+    # Read the new rankings CSV
+    df_new = pd.read_csv(args.ranks_csv)
+    if "translation_id" not in df_new.columns or "llm_rank" not in df_new.columns:
         raise ValueError("llm_ranks.csv must have columns: translation_id, llm_rank")
 
+    # Read the backup CSV; if it doesn't exist, there is nothing to diff against
+    if not os.path.exists(args.backup_csv):
+        print(f"Backup file {args.backup_csv} not found; updating all rows")
+        df_changed = df_new
+    else:
+        df_old = pd.read_csv(args.backup_csv)
+        if "translation_id" not in df_old.columns or "llm_rank" not in df_old.columns:
+            raise ValueError("Backup CSV must have columns: translation_id, llm_rank")
+
+        # Left-merge so ids that are new in df_new get NaN in the _old column
+        merged = df_new.merge(
+            df_old,
+            on="translation_id",
+            how="left",
+            suffixes=("_new", "_old"),
+        )
+
+        # Keep only rows whose llm_rank changed or that are entirely new
+        df_changed = merged[
+            (merged["llm_rank_old"].isna())
+            | (merged["llm_rank_new"] != merged["llm_rank_old"])
+        ][["translation_id", "llm_rank_new"]].rename(columns={"llm_rank_new": "llm_rank"})
+
+    print(f"Found {len(df_changed)} changed/new rows out of {len(df_new)} total")
+
+    if len(df_changed) == 0:
+        print("No changes to update")
+        return
+
+    if len(df_changed) <= 200:
+        print("df_changed:", df_changed, sep="\n")
+
     engine = get_engine()
 
     sql = text("""
         update public.translation as t
-        set llm_rank = v.llm_rank,
-            updated_at = now()
+        set llm_rank = v.llm_rank
         from (values (:translation_id, :llm_rank)) as v(translation_id, llm_rank)
         where t.id = v.translation_id::uuid
     """)
 
     total = 0
     with engine.begin() as conn:
-        rows = df.to_dict("records")
+        rows = df_changed.to_dict("records")
         for i in range(0, len(rows), args.batch_size):
             batch = rows[i : i + args.batch_size]
             # executemany: one parameter dict per row
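The hunk ends just before the execute call, but the comment above points at SQLAlchemy's executemany path. A minimal sketch of how the loop body presumably continues (the actual statement is outside this excerpt):

```python
# Passing a list of parameter dicts to Connection.execute() makes
# SQLAlchemy dispatch to the driver's executemany under the hood.
conn.execute(sql, batch)
# rowcount after executemany is driver-dependent (psycopg2 can report only
# the last statement's count), so track the number of submitted rows instead.
total += len(batch)
```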
|
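As a sanity check on the merge-based diff, here is a self-contained toy run of the same pattern (made-up data; column names mirror the script):

```python
import pandas as pd

df_old = pd.DataFrame({"translation_id": ["a", "b"], "llm_rank": [1, 2]})
df_new = pd.DataFrame({"translation_id": ["a", "b", "c"], "llm_rank": [1, 5, 3]})

merged = df_new.merge(df_old, on="translation_id", how="left", suffixes=("_new", "_old"))
changed = merged[
    merged["llm_rank_old"].isna() | (merged["llm_rank_new"] != merged["llm_rank_old"])
][["translation_id", "llm_rank_new"]].rename(columns={"llm_rank_new": "llm_rank"})

print(changed)  # keeps b (2 -> 5, changed) and c (new); drops a (unchanged)
```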