Skip to content

Commit fdcb7ac

Browse files
committed
add prob. set cover, ruff issues, updated readme
1 parent ad6637a commit fdcb7ac

File tree

7 files changed

+1472
-126
lines changed

7 files changed

+1472
-126
lines changed

README.md

Lines changed: 227 additions & 108 deletions
Large diffs are not rendered by default.

examples/predictive_selection_demo.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
exit(1)
5454

5555

56-
5756
def create_noisy_version(df: pd.DataFrame, noise_level: float = 0.1) -> pd.DataFrame:
5857
"""Create a noisy version of dataset to simulate measurement uncertainty."""
5958
noisy_df = df.copy()

examples/probabilistic_demo.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python3
"""Demo of probabilistic and ε-relaxed set cover capabilities.

This script demonstrates the new probabilistic set cover algorithms that handle
uncertainty about unseen column values. Instead of requiring deterministic
disambiguation, these algorithms find minimal column sets that achieve
disambiguation with high probability (1-ε).
"""

import numpy as np
import pandas as pd

from rowvoi import (
    evaluate_coverage,
    minimal_key_greedy,
    probabilistic_minimal_key,
    suggest_next_feature_epsilon,
)
from rowvoi.types import CandidateState

print("🎲 PROBABILISTIC SET COVER DEMO")
print("=" * 60)
print("""
This demo shows how probabilistic set cover handles uncertainty
about column values, finding minimal sets that disambiguate rows
with high probability rather than requiring certainty.
""")

# Create a sample dataset with some uncertainty.
# Seeded so every run of the demo produces identical output.
np.random.seed(42)
df = pd.DataFrame(
    {
        "region": ["North", "South", "East", "West"] * 25,
        "product": ["A", "A", "B", "B", "C", "C", "A", "B"] * 12 + ["C"] * 4,
        "channel": ["Online", "Store", "Online", "Store"] * 25,
        "segment": np.random.choice(["SMB", "Enterprise", "Consumer"], 100),
        "value": np.random.choice(["High", "Medium", "Low"], 100, p=[0.2, 0.5, 0.3]),
        "priority": np.random.choice(["P1", "P2", "P3"], 100, p=[0.1, 0.3, 0.6]),
    }
)

print(f"📊 Dataset: {len(df)} rows × {len(df.columns)} columns")
print(f" Columns: {list(df.columns)}")
print()

# Select some rows to disambiguate
rows = [5, 12, 23, 34, 45]
print(f"🎯 Task: Disambiguate rows {rows}")
print()

# 1. DETERMINISTIC APPROACH — baseline: every row pair must be separated
# with certainty (eps=0).
print("1️⃣ DETERMINISTIC SET COVER (eps=0)")
print("-" * 40)
deterministic = minimal_key_greedy(df, rows)
print(f" Columns needed: {deterministic}")
print(f" Number of columns: {len(deterministic)}")

# Verify coverage (second return value lists uncovered pairs; unused here)
coverage, _ = evaluate_coverage(df, rows, deterministic)
print(f" Actual coverage: {coverage:.1%}")
print()

# 2. PROBABILISTIC WITH SMALL EPSILON
print("2️⃣ PROBABILISTIC SET COVER (eps=0.05)")
print("-" * 40)
print(" Allow 5% expected uncovered pairs")

result_5pct = probabilistic_minimal_key(df, rows, eps=0.05)
print(f" Columns selected: {result_5pct.columns}")
print(f" Number of columns: {len(result_5pct.columns)}")
print(f" Expected coverage: {result_5pct.expected_coverage:.1%}")

# Check actual coverage
actual_cov, _ = evaluate_coverage(df, rows, result_5pct.columns)
print(f" Actual coverage: {actual_cov:.1%}")
print(f" Savings: {len(deterministic) - len(result_5pct.columns)} fewer columns")
print()

# 3. PROBABILISTIC WITH LARGER EPSILON
print("3️⃣ PROBABILISTIC SET COVER (eps=0.20)")
print("-" * 40)
print(" Allow 20% expected uncovered pairs")

result_20pct = probabilistic_minimal_key(df, rows, eps=0.20)
print(f" Columns selected: {result_20pct.columns}")
print(f" Number of columns: {len(result_20pct.columns)}")
print(f" Expected coverage: {result_20pct.expected_coverage:.1%}")

actual_cov, _ = evaluate_coverage(df, rows, result_20pct.columns)
print(f" Actual coverage: {actual_cov:.1%}")
print(f" Savings: {len(deterministic) - len(result_20pct.columns)} fewer columns")
print()

# 4. COST-AWARE SELECTION — per-column acquisition costs steer the greedy
# choice toward cheap columns that still hit the coverage target.
print("4️⃣ COST-AWARE PROBABILISTIC SELECTION")
print("-" * 40)
print(" Some columns are more expensive to collect")

costs = {
    "region": 1.0,  # Easy to get
    "product": 1.0,  # Easy to get
    "channel": 2.0,  # Requires lookup
    "segment": 5.0,  # Requires analysis
    "value": 10.0,  # Expensive calculation
    "priority": 3.0,  # Moderate cost
}

result_cost = probabilistic_minimal_key(df, rows, eps=0.1, costs=costs)
print(f" Columns selected: {result_cost.columns}")
print(f" Total cost: {sum(costs[c] for c in result_cost.columns):.1f}")
print(f" Expected coverage: {result_cost.expected_coverage:.1%}")

# Compare to non-cost-aware
result_no_cost = probabilistic_minimal_key(df, rows, eps=0.1)
cost_no_aware = sum(costs[c] for c in result_no_cost.columns)
print(f" Cost without awareness: {cost_no_aware:.1f}")
print(
    f" Cost savings: {cost_no_aware - sum(costs[c] for c in result_cost.columns):.1f}"
)
print()

# 5. ADAPTIVE SEQUENTIAL SELECTION — pick one column at a time, updating
# the candidate state after each observation.
print("5️⃣ ADAPTIVE EPSILON POLICY")
print("-" * 40)
print(" Sequential feature selection with eps=0.1")

# Start with no observations and a uniform posterior over candidate rows.
state = CandidateState(
    candidate_rows=rows,
    posterior={r: 1 / len(rows) for r in rows},
    observed_cols=[],
    observed_values={},
)

selected_sequence = []
for step in range(5):
    suggestion = suggest_next_feature_epsilon(df, state, eps=0.1, costs=costs)

    # None signals the eps target is already met — stop early.
    if suggestion is None:
        print(f" ✅ Target coverage achieved after {step} features")
        break

    print(f" Step {step + 1}: Select '{suggestion.col}'")
    print(f" Current coverage: {suggestion.current_coverage:.1%}")
    print(f" Expected gain: {suggestion.expected_coverage_gain:.2f} pairs")

    selected_sequence.append(suggestion.col)

    # Update state with observation. The demo simulates always observing
    # the first candidate row's value for the suggested column.
    state = CandidateState(
        candidate_rows=rows,
        posterior=state.posterior,
        observed_cols=state.observed_cols + [suggestion.col],
        observed_values={
            **state.observed_values,
            suggestion.col: df.loc[rows[0], suggestion.col],
        },
    )

print(f"\n Final sequence: {selected_sequence}")
final_cov, _ = evaluate_coverage(df, rows, selected_sequence)
print(f" Final coverage: {final_cov:.1%}")
print()

# 6. COMPARISON SUMMARY — recompute coverage for each approach so every
# table row reports its own number.
print("📊 SUMMARY COMPARISON")
print("-" * 40)
print(f"{'Approach':<30} {'Columns':<10} {'Coverage':<12}")
print("-" * 52)

det_cov, _ = evaluate_coverage(df, rows, deterministic)
print(f"{'Deterministic (eps=0)':<30} {len(deterministic):<10} {det_cov:>11.1%}")

r5_cov, _ = evaluate_coverage(df, rows, result_5pct.columns)
print(
    f"{'Probabilistic (eps=0.05)':<30} {len(result_5pct.columns):<10} {r5_cov:>11.1%}"
)

r20_cov, _ = evaluate_coverage(df, rows, result_20pct.columns)
print(
    f"{'Probabilistic (eps=0.20)':<30} {len(result_20pct.columns):<10} {r20_cov:>11.1%}"
)

# BUG FIX: the original printed `actual_cov` here, which still held the
# eps=0.20 coverage from section 3 — the cost-aware coverage was never
# computed. Evaluate result_cost's own columns instead.
cost_cov, _ = evaluate_coverage(df, rows, result_cost.columns)
print(
    f"{'Cost-aware (eps=0.10)':<30} {len(result_cost.columns):<10} {cost_cov:>11.1%}"
)

print(f"{'Adaptive (eps=0.10)':<30} {len(selected_sequence):<10} {final_cov:>11.1%}")

print()
print("💡 KEY INSIGHTS:")
print(" • Relaxing epsilon reduces columns needed")
print(" • Cost awareness changes column selection")
print(" • Adaptive policy responds to observations")
print(" • Trade-off: fewer columns vs coverage guarantee")

rowvoi/__init__.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@
5656
FeatureSuggestion,
5757
RowVoiModel,
5858
)
59+
from .probcover import (
60+
AdaptiveFeatureSuggestion,
61+
ProbabilisticCoverageResult,
62+
estimate_pair_separation_probs,
63+
evaluate_coverage,
64+
greedy_epsilon_cover,
65+
probabilistic_minimal_key,
66+
suggest_next_feature_epsilon,
67+
)
5968
from .setcover import (
6069
SetCoverAlgorithm,
6170
SetCoverResult,
@@ -82,6 +91,14 @@
8291
"SetCoverAlgorithm",
8392
"SetCoverResult",
8493
"solve_set_cover",
94+
# probabilistic cover
95+
"probabilistic_minimal_key",
96+
"suggest_next_feature_epsilon",
97+
"ProbabilisticCoverageResult",
98+
"AdaptiveFeatureSuggestion",
99+
"estimate_pair_separation_probs",
100+
"evaluate_coverage",
101+
"greedy_epsilon_cover",
85102
# mutual information (candidate set)
86103
"candidate_mi",
87104
"best_feature_by_candidate_mi",

0 commit comments

Comments
 (0)