superconfounder/example.py at feature/confounder-analysis-module · zhenchenwang/superconfounder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from simulation import generate_data
from confounder_analyzer import ConfounderAnalyzer
import pandas as pd

def main():
    """
    Demonstrate the confounder analysis workflow end to end.

    Steps, each announced on stdout:
    1. Call ``generate_data`` for a synthetic DataFrame (5000 samples,
       6 features).
    2. Build a ``ConfounderAnalyzer`` over the candidates ``Z1`` .. ``Z6``.
    3. Rank the candidates and print the resulting table.
    4. Write the text report to ``confounder_analysis_report.txt`` in the
       current working directory (UTF-8 encoded).

    Failures during data generation, or during ranking/reporting, are caught
    and reported on stdout instead of being raised, because this is the
    top-level boundary of a demo script.

    Returns:
        None.
    """
    # Single source of truth for the number of candidates, so the simulated
    # feature count and the candidate list below cannot drift apart.
    n_features = 6

    print("Step 1: Generating synthetic data...")
    # Generate data with 5000 samples and `n_features` potential confounders
    try:
        df = generate_data(n_samples=5000, n_features=n_features)
        print("Data generation complete.")
        print(f"Generated {len(df)} samples and {len(df['cluster'].unique())} clusters.")
        print("True confounders in simulation: Z1, Z2 (linear)")
        print("Variable with non-linear effect on Y: Z3")
        print("-" * 30)
    except Exception as e:
        # Without data there is nothing to analyze: report and stop.
        print(f"Failed to generate data. Error: {e}")
        return

    # Define the list of potential confounders to be analyzed.
    # NOTE(review): assumes generate_data names its feature columns
    # Z1..Z<n_features> -- confirm against the simulation module.
    confounder_candidates = [f'Z{i}' for i in range(1, n_features + 1)]

    print("\nStep 2: Initializing and running ConfounderAnalyzer...")
    # Initialize the analyzer with the list of candidates
    analyzer = ConfounderAnalyzer(
        confounder_candidates=confounder_candidates,
        n_boot=100,      # Using 100 bootstrap samples for reasonable speed
        pc_alpha=0.05,
        random_state=42
    )

    # Run the full analysis
    try:
        final_ranking_df = analyzer.rank_confounders(df)
        print("\nStep 3: Confounder Analysis Results")
        print("-" * 30)
        print("Final Confounder Ranking (higher score is stronger evidence):")

        # Widen the display only while printing the table, so calling main()
        # does not permanently alter the process-wide pandas options.
        with pd.option_context('display.width', 100, 'display.max_columns', 10):
            print(final_ranking_df)
        print("\nAnalysis complete. Z1 and Z2 are expected to rank highest.")

        # Generate and save the text report
        print("\nStep 4: Generating and saving report...")
        report_text = analyzer.generate_report(final_ranking_df)
        report_filename = "confounder_analysis_report.txt"
        # Explicit encoding: the platform default may be unable to encode
        # non-ASCII characters in the report and differs between systems.
        with open(report_filename, "w", encoding="utf-8") as f:
            f.write(report_text)
        print(f"Report saved to {report_filename}")

    except Exception as e:
        print(f"An error occurred during confounder analysis. Error: {e}")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()