Skip to content

Commit 9ee82fd

Browse files
committed
logging, deptry, type hints
1 parent 54e8b90 commit 9ee82fd

File tree

9 files changed

+745
-240
lines changed

9 files changed

+745
-240
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ test-cov: ## Run tests with coverage
1616

1717
lint: ## Run linting checks
1818
uv run ruff check .
19+
uv run deptry .
1920

2021
format: ## Format code
2122
uv run ruff format .
@@ -54,6 +55,7 @@ ci-docker: ## Run CI in Docker (standard Python image)
5455
"pip install uv && \
5556
uv sync --extra dev && \
5657
uv run ruff check . && \
58+
uv run deptry . && \
5759
uv run ruff format --check . && \
5860
uv run pytest tests/ -v"
5961

examples/advanced_algorithms/known_data_setcover_demo.py

Lines changed: 63 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -41,18 +41,21 @@
4141
import numpy as np
4242
import pandas as pd
4343

44-
from rowvoi import KeyProblem, find_key, plan_key_path
44+
from rowvoi import KeyProblem, find_key, get_logger, plan_key_path
4545

4646
warnings.filterwarnings("ignore")
4747

48+
# Set up logging
49+
logger = get_logger(__name__)
50+
4851
# Check if we can import sklearn for datasets
4952
try:
5053
from sklearn.datasets import load_breast_cancer, load_digits, load_iris, load_wine
5154

5255
SKLEARN_AVAILABLE = True
5356
except ImportError:
5457
SKLEARN_AVAILABLE = False
55-
print("⚠️ scikit-learn not available. Using synthetic data instead.")
58+
logger.warning("⚠️ scikit-learn not available. Using synthetic data instead.")
5659

5760

5861
def load_sample_datasets() -> dict[str, pd.DataFrame]:
@@ -148,8 +151,10 @@ def benchmark_algorithms(
148151
problem = KeyProblem(df, rows, costs=costs)
149152
total_pairs = len(rows) * (len(rows) - 1) // 2
150153

151-
print(f"\n 🎯 Benchmarking {len(algorithms)} algorithms on {len(rows)} rows...")
152-
print(f" Total pairs to distinguish: {total_pairs}")
154+
logger.info(
155+
f"\n 🎯 Benchmarking {len(algorithms)} algorithms on {len(rows)} rows..."
156+
)
157+
logger.info(f" Total pairs to distinguish: {total_pairs}")
153158

154159
for algo in algorithms:
155160
try:
@@ -178,21 +183,27 @@ def benchmark_algorithms(
178183
}
179184

180185
status = "✅" if is_valid else "❌"
181-
print(
182-
f" {status} {algo.upper()}: {len(key)} cols, "
183-
f"cost={total_cost:.1f}, {runtime:.3f}s"
184-
)
186+
if status == "✅":
187+
logger.info(
188+
f" {status} {algo.upper()}: {len(key)} cols, "
189+
f"cost={total_cost:.1f}, {runtime:.3f}s"
190+
)
191+
else:
192+
logger.error(
193+
f" {status} {algo.upper()}: {len(key)} cols, "
194+
f"cost={total_cost:.1f}, {runtime:.3f}s"
195+
)
185196

186197
except Exception as e:
187-
print(f" ❌ {algo.upper()}: Failed ({str(e)[:50]})")
198+
logger.error(f" ❌ {algo.upper()}: Failed ({str(e)[:50]})")
188199
results[algo] = {"error": str(e)}
189200

190201
return results
191202

192203

193204
def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: str):
194205
"""Demonstrate path planning functionality."""
195-
print(f"\n 🛤️ Path Planning for {dataset_name}")
206+
logger.info(f"\n 🛤️ Path Planning for {dataset_name}")
196207

197208
# Create some example costs
198209
costs = {col: np.random.uniform(0.5, 3.0) for col in df.columns}
@@ -201,16 +212,16 @@ def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: s
201212
path_coverage = plan_key_path(df, rows, costs=costs, objective="pair_coverage")
202213
path_entropy = plan_key_path(df, rows, costs=costs, objective="entropy")
203214

204-
print(" Coverage-optimized path (first 3 steps):")
215+
logger.info(" Coverage-optimized path (first 3 steps):")
205216
for i, step in enumerate(path_coverage.steps[:3]):
206-
print(
217+
logger.info(
207218
f" {i + 1}. {step.col}: +{step.newly_covered_pairs} pairs "
208219
f"({step.coverage:.0%} total, cost={step.cumulative_cost:.1f})"
209220
)
210221

211-
print(" Entropy-optimized path (first 3 steps):")
222+
logger.info(" Entropy-optimized path (first 3 steps):")
212223
for i, step in enumerate(path_entropy.steps[:3]):
213-
print(
224+
logger.info(
214225
f" {i + 1}. {step.col}: +{step.newly_covered_pairs} pairs "
215226
f"({step.coverage:.0%} total, cost={step.cumulative_cost:.1f})"
216227
)
@@ -219,19 +230,19 @@ def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: s
219230
budget_cols = path_coverage.prefix_for_budget(5.0)
220231
epsilon_cols = path_coverage.prefix_for_epsilon_pairs(0.1)
221232

222-
print(f" Within budget of 5.0: {budget_cols}")
223-
print(f" For 90% coverage: {epsilon_cols}")
233+
logger.info(f" Within budget of 5.0: {budget_cols}")
234+
logger.info(f" For 90% coverage: {epsilon_cols}")
224235

225236

226237
def analyze_dataset_properties(df: pd.DataFrame, dataset_name: str):
227238
"""Analyze properties of the dataset that affect set cover performance."""
228-
print(f"\n📊 Dataset Analysis: {dataset_name}")
229-
print(f" Shape: {df.shape[0]} rows × {df.shape[1]} columns")
230-
print(f" Data types: {df.dtypes.value_counts().to_dict()}")
239+
logger.info(f"\n📊 Dataset Analysis: {dataset_name}")
240+
logger.info(f" Shape: {df.shape[0]} rows × {df.shape[1]} columns")
241+
logger.info(f" Data types: {df.dtypes.value_counts().to_dict()}")
231242

232243
# Column cardinality analysis
233244
cardinalities = [df[col].nunique() for col in df.columns]
234-
print(
245+
logger.info(
235246
f" Column cardinalities: min={min(cardinalities)}, "
236247
f"max={max(cardinalities)}, mean={np.mean(cardinalities):.1f}"
237248
)
@@ -241,26 +252,30 @@ def analyze_dataset_properties(df: pd.DataFrame, dataset_name: str):
241252
high_card_cols = [col for col in df.columns if df[col].nunique() >= len(df) * 0.8]
242253

243254
if low_card_cols:
244-
print(f" ⚠️ Low-cardinality columns (≤2 values): {len(low_card_cols)}")
255+
logger.warning(
256+
f" ⚠️ Low-cardinality columns (≤2 values): {len(low_card_cols)}"
257+
)
245258
if high_card_cols:
246-
print(f" ⚠️ High-cardinality columns (≥80% unique): {len(high_card_cols)}")
259+
logger.warning(
260+
f" ⚠️ High-cardinality columns (≥80% unique): {len(high_card_cols)}"
261+
)
247262

248263

249264
def main():
250265
"""Run comprehensive set cover demonstration."""
251-
print("🎯 ROWVOI SET COVER DEMONSTRATION")
252-
print("=" * 50)
253-
print("\n🔍 Loading datasets...")
266+
logger.info("🎯 ROWVOI SET COVER DEMONSTRATION")
267+
logger.info("=" * 50)
268+
logger.info("\n🔍 Loading datasets...")
254269

255270
datasets = load_sample_datasets()
256-
print(f" Loaded {len(datasets)} datasets: {list(datasets.keys())}")
271+
logger.info(f" Loaded {len(datasets)} datasets: {list(datasets.keys())}")
257272

258273
all_results = {}
259274

260275
for name, df_raw in datasets.items():
261-
print(f"\n{'=' * 60}")
262-
print(f"🧪 TESTING DATASET: {name}")
263-
print("=" * 60)
276+
logger.info(f"\n{'=' * 60}")
277+
logger.info(f"🧪 TESTING DATASET: {name}")
278+
logger.info("=" * 60)
264279

265280
# Discretize for better performance
266281
df = discretize_dataset(df_raw)
@@ -274,7 +289,7 @@ def main():
274289
dataset_results = []
275290

276291
for i, rows in enumerate(subsets):
277-
print(f"\n🔬 Test Case {i + 1}: {len(rows)} rows {rows}")
292+
logger.info(f"\n🔬 Test Case {i + 1}: {len(rows)} rows {rows}")
278293

279294
# Create example costs based on column cardinality
280295
costs = {
@@ -293,7 +308,7 @@ def main():
293308
all_results[name] = dataset_results
294309

295310
# Summary for this dataset
296-
print(f"\n📈 Summary for {name}:")
311+
logger.info(f"\n📈 Summary for {name}:")
297312
successful_results = []
298313
for test_case in dataset_results:
299314
for _algo, result in test_case["results"].items():
@@ -303,26 +318,30 @@ def main():
303318
if successful_results:
304319
avg_size = np.mean([r["size"] for r in successful_results])
305320
avg_runtime = np.mean([r["runtime"] for r in successful_results])
306-
print(f" Average key size: {avg_size:.1f} columns")
307-
print(f" Average runtime: {avg_runtime:.3f} seconds")
321+
logger.info(f" Average key size: {avg_size:.1f} columns")
322+
logger.info(f" Average runtime: {avg_runtime:.3f} seconds")
308323

309324
# Best algorithm by size
310325
best_by_size = min(successful_results, key=lambda x: x["size"])
311-
print(
326+
logger.info(
312327
f" Best solution: {best_by_size['size']} columns "
313328
f"({best_by_size['algorithm']}, cost={best_by_size['cost']:.1f})"
314329
)
315330

316-
print(f"\n{'=' * 60}")
317-
print("✅ DEMONSTRATION COMPLETE")
318-
print("=" * 60)
319-
print("\n💡 KEY INSIGHTS:")
320-
print(" • Greedy algorithm provides good approximation quickly")
321-
print(" • Exact solutions feasible for small problems (<15 columns)")
322-
print(" • Metaheuristics (SA, GA) can improve on greedy for larger problems")
323-
print(" • Column costs significantly impact optimal column selection")
324-
print(" • Path planning enables budget-constrained and progressive selection")
325-
print(
331+
logger.info(f"\n{'=' * 60}")
332+
logger.info("✅ DEMONSTRATION COMPLETE")
333+
logger.info("=" * 60)
334+
logger.info("\n💡 KEY INSIGHTS:")
335+
logger.info(" • Greedy algorithm provides good approximation quickly")
336+
logger.info(" • Exact solutions feasible for small problems (<15 columns)")
337+
logger.info(
338+
" • Metaheuristics (SA, GA) can improve on greedy for larger problems"
339+
)
340+
logger.info(" • Column costs significantly impact optimal column selection")
341+
logger.info(
342+
" • Path planning enables budget-constrained and progressive selection"
343+
)
344+
logger.info(
326345
"\n📖 For interactive selection with unknown data, "
327346
"see predictive_selection_demo.py"
328347
)

0 commit comments

Comments
 (0)