Skip to content

Commit 1461e54

Browse files
benzsevern and claude committed
feat: llm_auto flag + memory auto-enablement (#35)
- llm_auto: bool on GoldenMatchConfig, applied uniformly
- When llm_auto=True + API key: LLM scorer with $0.05 budget
- Memory enabled only when llm_auto=True
- Pipeline forwards llm_scorer and memory from auto-config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a6b9bfb commit 1461e54

5 files changed

Lines changed: 91 additions & 2 deletions

File tree

goldenmatch/_api.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -269,6 +269,7 @@ def dedupe_df(
269269
blocking: list[str] | None = None,
270270
threshold: float | None = None,
271271
llm_scorer: bool = False,
272+
llm_auto: bool = False,
272273
backend: str | None = None,
273274
source_name: str = "dataframe",
274275
) -> DedupeResult:
@@ -314,6 +315,8 @@ def dedupe_df(
314315
if llm_scorer and hasattr(config, "llm_scorer"):
315316
from goldenmatch.config.schemas import LLMScorerConfig
316317
config.llm_scorer = LLMScorerConfig(enabled=True)
318+
if llm_auto and hasattr(config, "llm_auto"):
319+
config.llm_auto = llm_auto
317320

318321
result = run_dedupe_df(
319322
df, config, source_name=source_name,

goldenmatch/config/schemas.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -429,6 +429,7 @@ class GoldenMatchConfig(BaseModel):
429429
transform: TransformConfig | None = None
430430
llm_boost: bool = False
431431
llm_scorer: LLMScorerConfig | None = None
432+
llm_auto: bool = False
432433
domain: DomainConfig | None = None
433434
backend: str | None = None # None (default Polars), "ray", "duckdb"
434435
memory: MemoryConfig | None = None

goldenmatch/core/autoconfig.py

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,13 @@
1212
from goldenmatch.config.schemas import (
1313
BlockingConfig,
1414
BlockingKeyConfig,
15+
BudgetConfig,
1516
GoldenMatchConfig,
1617
GoldenRulesConfig,
18+
LLMScorerConfig,
1719
MatchkeyConfig,
1820
MatchkeyField,
21+
MemoryConfig,
1922
OutputConfig,
2023
)
2124
from goldenmatch.core.profiler import _guess_type
@@ -970,7 +973,10 @@ def select_model(row_count: int, has_embedding_columns: bool, threshold: int = 5
970973

971974
# ── Main entry point ──────────────────────────────────────────────────────
972975

973-
def auto_configure_df(df: pl.DataFrame, llm_provider: str | None = None, domain_config=None) -> GoldenMatchConfig:
976+
def auto_configure_df(
977+
df: pl.DataFrame, llm_provider: str | None = None,
978+
domain_config=None, llm_auto: bool = False,
979+
) -> GoldenMatchConfig:
974980
"""Auto-generate a GoldenMatchConfig from a DataFrame.
975981
976982
Profiles columns by name heuristics and data sampling, then builds
@@ -1139,12 +1145,37 @@ def auto_configure_df(df: pl.DataFrame, llm_provider: str | None = None, domain_
11391145
mk.name, original, mk.threshold, avg_null, avg_len,
11401146
)
11411147

1148+
# ── LLM auto-config ──
1149+
llm_scorer_config = None
1150+
if llm_auto:
1151+
import os
1152+
_provider = None
1153+
if os.environ.get("ANTHROPIC_API_KEY"):
1154+
_provider = "anthropic"
1155+
elif os.environ.get("OPENAI_API_KEY"):
1156+
_provider = "openai"
1157+
if _provider:
1158+
llm_scorer_config = LLMScorerConfig(
1159+
enabled=True,
1160+
candidate_lo=0.60,
1161+
candidate_hi=0.90,
1162+
auto_threshold=0.90,
1163+
budget=BudgetConfig(max_cost_usd=0.05),
1164+
)
1165+
logger.info("LLM scorer auto-enabled (provider=%s, budget=$0.05)", _provider)
1166+
else:
1167+
logger.info("llm_auto=True but no API key found")
1168+
1169+
memory_config = MemoryConfig(enabled=True) if llm_auto else None
1170+
11421171
# Build config
11431172
config = GoldenMatchConfig(
11441173
matchkeys=matchkeys,
11451174
blocking=blocking,
11461175
golden_rules=GoldenRulesConfig(default_strategy="most_complete"),
11471176
output=OutputConfig(),
1177+
llm_scorer=llm_scorer_config,
1178+
memory=memory_config,
11481179
)
11491180

11501181
return config

goldenmatch/core/pipeline.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -227,11 +227,17 @@ def _run_dedupe_pipeline(
227227
if auto_config:
228228
from goldenmatch.core.autoconfig import auto_configure_df
229229
combined_df_tmp = combined_lf.collect()
230-
auto_cfg = auto_configure_df(combined_df_tmp, llm_provider=auto_config_llm_provider)
230+
auto_cfg = auto_configure_df(
231+
combined_df_tmp,
232+
llm_provider=auto_config_llm_provider,
233+
llm_auto=config.llm_auto,
234+
)
231235
config.matchkeys = auto_cfg.matchkeys
232236
config.match_settings = auto_cfg.match_settings
233237
config.blocking = auto_cfg.blocking
234238
config.golden_rules = auto_cfg.golden_rules
239+
config.llm_scorer = auto_cfg.llm_scorer
240+
config.memory = auto_cfg.memory
235241
matchkeys = config.get_matchkeys()
236242
logger.info("Auto-configured from cleaned data: %d matchkeys", len(matchkeys))
237243
combined_lf = combined_df_tmp.lazy()

tests/test_autoconfig.py

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1050,3 +1050,51 @@ def test_threshold_raised_short_strings(self):
10501050
weighted_mks = [mk for mk in config.get_matchkeys() if mk.type == "weighted"]
10511051
assert len(weighted_mks) > 0, "Expected at least one weighted matchkey"
10521052
assert weighted_mks[0].threshold >= 0.80
1053+
1054+
1055+
class TestLLMMemoryAutoEnablement:
    """Check the LLM-scorer and memory settings produced by ``auto_configure_df``.

    Each test builds the same small name/email frame, adjusts the API-key
    environment variables where relevant, and inspects the ``llm_scorer`` and
    ``memory`` attributes of the returned config.
    """

    @staticmethod
    def _sample_frame():
        """Return the three-row name/email frame shared by every test."""
        return pl.DataFrame(
            {
                "name": ["John", "Jane", "Bob"],
                "email": ["a@t.com", "b@t.com", "c@t.com"],
            }
        )

    def test_llm_auto_with_api_key(self):
        """llm_auto=True with an OpenAI key set: scorer enabled, budget 0.05."""
        from unittest.mock import patch

        from goldenmatch.core.autoconfig import auto_configure_df

        frame = self._sample_frame()
        with patch.dict("os.environ", {"OPENAI_API_KEY": "sk-fake"}):
            cfg = auto_configure_df(frame, llm_auto=True)

        scorer = cfg.llm_scorer
        assert scorer is not None
        assert scorer.enabled is True
        assert scorer.budget.max_cost_usd == 0.05

    def test_llm_auto_no_key(self):
        """llm_auto=True with both API keys removed: no scorer is configured."""
        import os
        from unittest.mock import patch

        from goldenmatch.core.autoconfig import auto_configure_df

        frame = self._sample_frame()
        blanked_keys = {"OPENAI_API_KEY": "", "ANTHROPIC_API_KEY": ""}
        with patch.dict("os.environ", blanked_keys):
            # patch.dict restores the real environment on exit, so removing
            # the keys outright here cannot leak into other tests.
            for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
                os.environ.pop(key_name, None)
            cfg = auto_configure_df(frame, llm_auto=True)

        assert cfg.llm_scorer is None

    def test_llm_auto_off(self):
        """llm_auto=False: no scorer even though an API key is present."""
        from unittest.mock import patch

        from goldenmatch.core.autoconfig import auto_configure_df

        frame = self._sample_frame()
        with patch.dict("os.environ", {"OPENAI_API_KEY": "sk-fake"}):
            cfg = auto_configure_df(frame, llm_auto=False)

        assert cfg.llm_scorer is None

    def test_memory_with_llm_auto(self):
        """llm_auto=True: the returned config carries an enabled memory config."""
        from unittest.mock import patch

        from goldenmatch.core.autoconfig import auto_configure_df

        frame = self._sample_frame()
        with patch.dict("os.environ", {"OPENAI_API_KEY": "sk-fake"}):
            cfg = auto_configure_df(frame, llm_auto=True)

        memory = cfg.memory
        assert memory is not None
        assert memory.enabled is True

    def test_memory_off_by_default(self):
        """Calling without llm_auto leaves ``memory`` unset."""
        from goldenmatch.core.autoconfig import auto_configure_df

        cfg = auto_configure_df(self._sample_frame())
        assert cfg.memory is None

0 commit comments

Comments
 (0)