@@ -41,18 +41,21 @@
 import numpy as np
 import pandas as pd
 
-from rowvoi import KeyProblem, find_key, plan_key_path
+from rowvoi import KeyProblem, find_key, get_logger, plan_key_path
 
 warnings.filterwarnings("ignore")
 
+# Set up logging
+logger = get_logger(__name__)
+
 # Check if we can import sklearn for datasets
 try:
     from sklearn.datasets import load_breast_cancer, load_digits, load_iris, load_wine
 
     SKLEARN_AVAILABLE = True
 except ImportError:
     SKLEARN_AVAILABLE = False
-    print("⚠️ scikit-learn not available. Using synthetic data instead.")
+    logger.warning("⚠️ scikit-learn not available. Using synthetic data instead.")
 
 
 def load_sample_datasets() -> dict[str, pd.DataFrame]:
@@ -148,8 +151,10 @@ def benchmark_algorithms(
     problem = KeyProblem(df, rows, costs=costs)
     total_pairs = len(rows) * (len(rows) - 1) // 2
 
-    print(f"\n🎯 Benchmarking {len(algorithms)} algorithms on {len(rows)} rows...")
-    print(f"   Total pairs to distinguish: {total_pairs}")
+    logger.info(
+        f"\n🎯 Benchmarking {len(algorithms)} algorithms on {len(rows)} rows..."
+    )
+    logger.info(f"   Total pairs to distinguish: {total_pairs}")
 
     for algo in algorithms:
         try:
@@ -178,21 +183,27 @@ def benchmark_algorithms(
             }
 
             status = "✅" if is_valid else "❌"
-            print(
-                f"   {status} {algo.upper()}: {len(key)} cols, "
-                f"cost={total_cost:.1f}, {runtime:.3f}s"
-            )
+            if status == "✅":
+                logger.info(
+                    f"   {status} {algo.upper()}: {len(key)} cols, "
+                    f"cost={total_cost:.1f}, {runtime:.3f}s"
+                )
+            else:
+                logger.error(
+                    f"   {status} {algo.upper()}: {len(key)} cols, "
+                    f"cost={total_cost:.1f}, {runtime:.3f}s"
+                )
 
         except Exception as e:
-            print(f"   ❌ {algo.upper()}: Failed ({str(e)[:50]})")
+            logger.error(f"   ❌ {algo.upper()}: Failed ({str(e)[:50]})")
             results[algo] = {"error": str(e)}
 
     return results
 
 
 def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: str):
     """Demonstrate path planning functionality."""
-    print(f"\n🛤️ Path Planning for {dataset_name}")
+    logger.info(f"\n🛤️ Path Planning for {dataset_name}")
 
     # Create some example costs
     costs = {col: np.random.uniform(0.5, 3.0) for col in df.columns}
@@ -201,16 +212,16 @@ def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: str):
     path_coverage = plan_key_path(df, rows, costs=costs, objective="pair_coverage")
     path_entropy = plan_key_path(df, rows, costs=costs, objective="entropy")
 
-    print("   Coverage-optimized path (first 3 steps):")
+    logger.info("   Coverage-optimized path (first 3 steps):")
     for i, step in enumerate(path_coverage.steps[:3]):
-        print(
+        logger.info(
             f"      {i + 1}. {step.col}: +{step.newly_covered_pairs} pairs "
             f"({step.coverage:.0%} total, cost={step.cumulative_cost:.1f})"
         )
 
-    print("   Entropy-optimized path (first 3 steps):")
+    logger.info("   Entropy-optimized path (first 3 steps):")
     for i, step in enumerate(path_entropy.steps[:3]):
-        print(
+        logger.info(
             f"      {i + 1}. {step.col}: +{step.newly_covered_pairs} pairs "
             f"({step.coverage:.0%} total, cost={step.cumulative_cost:.1f})"
         )
@@ -219,19 +230,19 @@ def demonstrate_path_planning(df: pd.DataFrame, rows: list[int], dataset_name: str):
     budget_cols = path_coverage.prefix_for_budget(5.0)
     epsilon_cols = path_coverage.prefix_for_epsilon_pairs(0.1)
 
-    print(f"   Within budget of 5.0: {budget_cols}")
-    print(f"   For 90% coverage: {epsilon_cols}")
+    logger.info(f"   Within budget of 5.0: {budget_cols}")
+    logger.info(f"   For 90% coverage: {epsilon_cols}")
 
 
 def analyze_dataset_properties(df: pd.DataFrame, dataset_name: str):
     """Analyze properties of the dataset that affect set cover performance."""
-    print(f"\n📊 Dataset Analysis: {dataset_name}")
-    print(f"   Shape: {df.shape[0]} rows × {df.shape[1]} columns")
-    print(f"   Data types: {df.dtypes.value_counts().to_dict()}")
+    logger.info(f"\n📊 Dataset Analysis: {dataset_name}")
+    logger.info(f"   Shape: {df.shape[0]} rows × {df.shape[1]} columns")
+    logger.info(f"   Data types: {df.dtypes.value_counts().to_dict()}")
 
     # Column cardinality analysis
     cardinalities = [df[col].nunique() for col in df.columns]
-    print(
+    logger.info(
         f"   Column cardinalities: min={min(cardinalities)}, "
         f"max={max(cardinalities)}, mean={np.mean(cardinalities):.1f}"
     )
@@ -241,26 +252,30 @@ def analyze_dataset_properties(df: pd.DataFrame, dataset_name: str):
     high_card_cols = [col for col in df.columns if df[col].nunique() >= len(df) * 0.8]
 
     if low_card_cols:
-        print(f"   ⚠️ Low-cardinality columns (≤2 values): {len(low_card_cols)}")
+        logger.warning(
+            f"   ⚠️ Low-cardinality columns (≤2 values): {len(low_card_cols)}"
+        )
     if high_card_cols:
-        print(f"   ⚠️ High-cardinality columns (≥80% unique): {len(high_card_cols)}")
+        logger.warning(
+            f"   ⚠️ High-cardinality columns (≥80% unique): {len(high_card_cols)}"
+        )
 
 
 def main():
     """Run comprehensive set cover demonstration."""
-    print("🎯 ROWVOI SET COVER DEMONSTRATION")
-    print("=" * 50)
-    print("\n🔍 Loading datasets...")
+    logger.info("🎯 ROWVOI SET COVER DEMONSTRATION")
+    logger.info("=" * 50)
+    logger.info("\n🔍 Loading datasets...")
 
     datasets = load_sample_datasets()
-    print(f"   Loaded {len(datasets)} datasets: {list(datasets.keys())}")
+    logger.info(f"   Loaded {len(datasets)} datasets: {list(datasets.keys())}")
 
     all_results = {}
 
     for name, df_raw in datasets.items():
-        print(f"\n{'=' * 60}")
-        print(f"🧪 TESTING DATASET: {name}")
-        print("=" * 60)
+        logger.info(f"\n{'=' * 60}")
+        logger.info(f"🧪 TESTING DATASET: {name}")
+        logger.info("=" * 60)
 
         # Discretize for better performance
         df = discretize_dataset(df_raw)
@@ -274,7 +289,7 @@ def main():
         dataset_results = []
 
         for i, rows in enumerate(subsets):
-            print(f"\n🔬 Test Case {i + 1}: {len(rows)} rows {rows}")
+            logger.info(f"\n🔬 Test Case {i + 1}: {len(rows)} rows {rows}")
 
             # Create example costs based on column cardinality
             costs = {
@@ -293,7 +308,7 @@ def main():
         all_results[name] = dataset_results
 
         # Summary for this dataset
-        print(f"\n📈 Summary for {name}:")
+        logger.info(f"\n📈 Summary for {name}:")
         successful_results = []
         for test_case in dataset_results:
             for _algo, result in test_case["results"].items():
@@ -303,26 +318,30 @@ def main():
         if successful_results:
             avg_size = np.mean([r["size"] for r in successful_results])
             avg_runtime = np.mean([r["runtime"] for r in successful_results])
-            print(f"   Average key size: {avg_size:.1f} columns")
-            print(f"   Average runtime: {avg_runtime:.3f} seconds")
+            logger.info(f"   Average key size: {avg_size:.1f} columns")
+            logger.info(f"   Average runtime: {avg_runtime:.3f} seconds")
 
             # Best algorithm by size
             best_by_size = min(successful_results, key=lambda x: x["size"])
-            print(
+            logger.info(
                 f"   Best solution: {best_by_size['size']} columns "
                 f"({best_by_size['algorithm']}, cost={best_by_size['cost']:.1f})"
             )
 
-    print(f"\n{'=' * 60}")
-    print("✅ DEMONSTRATION COMPLETE")
-    print("=" * 60)
-    print("\n💡 KEY INSIGHTS:")
-    print("   • Greedy algorithm provides good approximation quickly")
-    print("   • Exact solutions feasible for small problems (<15 columns)")
-    print("   • Metaheuristics (SA, GA) can improve on greedy for larger problems")
-    print("   • Column costs significantly impact optimal column selection")
-    print("   • Path planning enables budget-constrained and progressive selection")
-    print(
+    logger.info(f"\n{'=' * 60}")
+    logger.info("✅ DEMONSTRATION COMPLETE")
+    logger.info("=" * 60)
+    logger.info("\n💡 KEY INSIGHTS:")
+    logger.info("   • Greedy algorithm provides good approximation quickly")
+    logger.info("   • Exact solutions feasible for small problems (<15 columns)")
+    logger.info(
+        "   • Metaheuristics (SA, GA) can improve on greedy for larger problems"
+    )
+    logger.info("   • Column costs significantly impact optimal column selection")
+    logger.info(
+        "   • Path planning enables budget-constrained and progressive selection"
+    )
+    logger.info(
         "\n📖 For interactive selection with unknown data, "
         "see predictive_selection_demo.py"
    )
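
A note on this change: `logging` records at INFO level are dropped unless the running process attaches a handler, so the demo is now silent by default where it used to print. The sketch below shows one way a caller could surface the output; it assumes `rowvoi.get_logger` returns a standard `logging.Logger` (or an equivalent) whose records propagate to the root logger, which this diff does not confirm.

```python
import logging

from rowvoi import get_logger

# Attach a root handler at INFO; format="%(message)s" keeps the demo's
# emoji-formatted lines identical to the old print output.
logging.basicConfig(level=logging.INFO, format="%(message)s")

# Assumption: get_logger(name) wraps logging.getLogger(name), so records
# propagate to the root handler configured above.
logger = get_logger(__name__)
logger.info("🎯 visible once a handler is configured")
logger.warning("⚠️ warnings surface even at the default WARNING level")
```

One payoff of the info/warning/error split in this patch: a handler set to WARNING shows only the problem cases (invalid keys, failed algorithms, extreme-cardinality columns) while suppressing the success output.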