Update statistical_analysis.py

mdsanwarhossain · web-flow · commit eaf979300d80 · 2025-07-06T23:12:32.000+06:00
diff --git a/models/metarials of experiment/statistical_analysis.py b/models/metarials of experiment/statistical_analysis.py
@@ -3,25 +3,21 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.stats import chi2_contingency, fisher_exact
-import os
 import math
 from matplotlib.patches import Patch
 
-# Set style for better visualizations
 plt.style.use('ggplot')
 sns.set(font_scale=1.1)
 sns.set_style("whitegrid")
 
-# Define custom color palette
 custom_palette = {
-    'SO': '#1f77b4',  # Blue
-    'GPT': '#ff7f0e',  # Orange
-    'Both': '#2ca02c',  # Green
-    'Neither': '#d62728'  # Red
+    'SO': '#1f77b4', 
+    'GPT': '#ff7f0e', 
+    'Both': '#2ca02c', 
+    'Neither': '#d62728'
 }
 
-# Define path to dataset
-path = "final/ML-code-smell-detection-main/finall_dataset_so_vs_gpt .csv"
+path = "dataset_so_vs_gpt.csv"
 df = pd.read_csv(path)
 
 print("\nDataset Information:")
@@ -30,7 +26,6 @@
 print("\nColumn names:")
 print(df.columns.tolist())
 
-# Process cyclomatic complexity
 print("\nProcessing cyclomatic complexity")
 
 so_cc_col = [col for col in df.columns if 'Cyclomatic complexity(so)' in col]
@@ -51,21 +46,18 @@
 else:
     print("Cyclomatic complexity columns not found in the dataset")
 
-# Get metrics columns
+
 so_metrics = [col for col in df.columns if '(so)' in col and 'code_snippet' not in col]
 gpt_metrics = [col for col in df.columns if '(gpt)' in col and 'code_snippet' not in col]
 
 print(f"Found {len(so_metrics)} StackOverflow metrics and {len(gpt_metrics)} GPT metrics")
 
-# Function to calculate association metrics from a contingency table
 def calculate_association_metrics(table):
-    # Extract values from the table
     try:
-        # Make sure we're handling string indices
-        a = table.loc['True', 'True']   # True, True
-        b = table.loc['True', 'False']  # True, False
-        c = table.loc['False', 'True']  # False, True
-        d = table.loc['False', 'False'] # False, False
+        a = table.loc['True', 'True']   
+        b = table.loc['True', 'False']  
+        c = table.loc['False', 'True']  
+        d = table.loc['False', 'False']
         
         # Calculate Phi coefficient
         n = a + b + c + d
@@ -76,7 +68,6 @@ def calculate_association_metrics(table):
         
         # Calculate Odds ratio with handling for zero cells
         if b*c == 0:
-            # Add a small constant to zero cells for odds ratio calculation
             adj_b = b if b > 0 else 0.5
             adj_c = c if c > 0 else 0.5
             odds_ratio = (a*d) / (adj_b*adj_c)
@@ -96,9 +87,9 @@ def calculate_association_metrics(table):
         neither_pct = (d / n) * 100  # Neither has TD
         
         # Calculate prevalence rates
-        gpt_rate = ((a + b) / n) * 100  # GPT TD rate
-        so_rate = ((a + c) / n) * 100  # SO TD rate
-        difference = so_rate - gpt_rate  # SO - GPT difference
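+        gpt_rate = ((a + b) / n) * 100  # NOTE(review): a + b is the 'True' ROW total; the caller builds the table via pd.crosstab(SO, GPT), so rows look like SO - confirm gpt_rate/so_rate are not swapped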
+        so_rate = ((a + c) / n) * 100 
+        difference = so_rate - gpt_rate
         
         return {
             'phi': phi,
@@ -126,41 +117,32 @@ def calculate_association_metrics(table):
         print(f"Error calculating association metrics: {e}")
         return None
 
-# Perform statistical tests between corresponding SO and GPT metrics
 print("\n--- Statistical Tests and Association Metrics: SO vs GPT for each metric ---")
 
 all_results = []
 distribution_data = []
 
 for so_metric, gpt_metric in zip(so_metrics, gpt_metrics):
-    # Skip if not categorical
     if df[so_metric].dtype == 'float64' or df[gpt_metric].dtype == 'float64':
         continue
     
-    # Create contingency table (False/True for consistency)
-    # Convert values to boolean for consistent table formatting
     df_temp = df.copy()
-    # Determine if columns are boolean or have other values (like True/False strings)
     if df[so_metric].dtype == bool:
-        pass  # Already boolean
+        pass 
     elif set(df[so_metric].dropna().unique()).issubset({True, False, 'True', 'False', 'TRUE', 'FALSE'}):
         df_temp[so_metric] = df_temp[so_metric].map(lambda x: x == True or x == 'True' or x == 'TRUE')
     else:
-        # For non-boolean categorical, convert to boolean based on presence of any value
         df_temp[so_metric] = df_temp[so_metric].notna() & (df_temp[so_metric] != False) & (df_temp[so_metric] != 'False') & (df_temp[so_metric] != 'FALSE')
     
     if df[gpt_metric].dtype == bool:
-        pass  # Already boolean
+        pass  
     elif set(df[gpt_metric].dropna().unique()).issubset({True, False, 'True', 'False', 'TRUE', 'FALSE'}):
         df_temp[gpt_metric] = df_temp[gpt_metric].map(lambda x: x == True or x == 'True' or x == 'TRUE')
     else:
-        # For non-boolean categorical, convert to boolean based on presence of any value
         df_temp[gpt_metric] = df_temp[gpt_metric].notna() & (df_temp[gpt_metric] != False) & (df_temp[gpt_metric] != 'False') & (df_temp[gpt_metric] != 'FALSE')
     
     contingency_table = pd.crosstab(df_temp[so_metric], df_temp[gpt_metric])
     
-    # Ensure the contingency table has the proper structure with False/True values
-    # Convert index and columns to string to avoid boolean indexing issues
     contingency_table.index = contingency_table.index.astype(str)
     contingency_table.columns = contingency_table.columns.astype(str)
     
@@ -175,7 +157,6 @@ def calculate_association_metrics(table):
     # Sort to ensure consistent order
     contingency_table = contingency_table.reindex(index=['False', 'True'], columns=['False', 'True'])
     
-    # Perform statistical tests - use Fisher's exact test instead of chi-square for small counts
     try:
         # Check if chi-square test is appropriate (all expected frequencies >= 5; otherwise Fisher's exact test is used)
         row_totals = contingency_table.sum(axis=1)
@@ -185,8 +166,7 @@ def calculate_association_metrics(table):
         # Calculate expected frequencies
         expected = np.outer(row_totals, col_totals) / n
         
-        # Determine which test to use
-        use_fisher = np.any(expected < 1)
+        use_fisher = np.any(expected < 5)
         
         if use_fisher:
             # Use Fisher's exact test
@@ -203,7 +183,6 @@ def calculate_association_metrics(table):
         association_metrics = calculate_association_metrics(contingency_table)
         
         if association_metrics:
-            # Extract metric name (remove "(so)" suffix)
             metric_name = so_metric.replace('(so)', '')
             
             # Store results
@@ -266,18 +245,14 @@ def calculate_association_metrics(table):
         print(f"Error performing analysis for {so_metric} vs {gpt_metric}: {e}")
         print(f"Contingency table: \n{contingency_table}")
 
-# Create summary dataframe and visualizations if results exist
 if all_results:
-    # Create summary dataframe
     results_df = pd.DataFrame(all_results)
     print("\n--- Analysis Results Summary ---")
     print(results_df.to_string())
     
     # Create distributions dataframe
     distributions_df = pd.DataFrame(distribution_data)
-    
-    # ------- Create Visualizations -------
-    
+        
     # 1. Test Statistic and Phi Coefficient visualization
     plt.figure(figsize=(14, 7))
     metrics = results_df['Metric'].tolist()
@@ -286,7 +261,6 @@ def calculate_association_metrics(table):
     
     # Create two-part bar chart
     ax = plt.subplot(111)
-    # Use a normalized test statistic for better visualization across different tests
     normalized_stats = []
     for i, test in enumerate(results_df['Test']):
         if test == "Chi-square test":
@@ -322,12 +296,10 @@ def calculate_association_metrics(table):
     # 2. Technical Debt Distribution visualization
     plt.figure(figsize=(16, 8))
     
-    # Create stacked bar chart for distribution
     metrics = distributions_df['Metric'].tolist()
     x = np.arange(len(metrics))
     width = 0.7
     
-    # Stacked bars
     plt.bar(x, distributions_df['Both (%)'], width, label='Both have TD', color=custom_palette['Both'])
     plt.bar(x, distributions_df['Only GPT (%)'], width, bottom=distributions_df['Both (%)'], 
             label='Only GPT has TD', color=custom_palette['GPT'])
@@ -338,20 +310,17 @@ def calculate_association_metrics(table):
             bottom=distributions_df['Both (%)'] + distributions_df['Only GPT (%)'] + distributions_df['Only SO (%)'], 
             label='Neither has TD', color=custom_palette['Neither'])
     
-    # Add labels and title
     plt.ylabel('Percentage of Cases')
     plt.title('Technical Debt Distribution by Metric')
     plt.xticks(x, metrics, rotation=45, ha='right')
     plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
     
-    # Add percentage labels on each segment (only for segments > 5%)
     for i, metric in enumerate(metrics):
-        # Variables to track the bottom of each bar
         bottom = 0
         for category, color_key in [('Both (%)', 'Both'), ('Only GPT (%)', 'GPT'), 
                              ('Only SO (%)', 'SO'), ('Neither (%)', 'Neither')]:
             value = distributions_df.loc[distributions_df['Metric'] == metric, category].values[0]
-            if value > 5:  # Only add text if segment is > 5%
+            if value > 5:
                 plt.text(i, bottom + value/2, f'{value:.1f}%', ha='center', va='center', 
                         fontweight='bold', color='black')
             bottom += value
@@ -363,18 +332,16 @@ def calculate_association_metrics(table):
     # 3. Prevalence Comparison visualization
     plt.figure(figsize=(14, 7))
     
-    # Create grouped bar chart
     x = np.arange(len(metrics))
     width = 0.35
     
     plt.bar(x - width/2, results_df['GPT Rate (%)'], width, label='GPT TD Rate', color=custom_palette['GPT'])
     plt.bar(x + width/2, results_df['SO Rate (%)'], width, label='SO TD Rate', color=custom_palette['SO'])
     
-    # Add text showing the difference
     for i, metric in enumerate(metrics):
         diff = results_df.loc[results_df['Metric'] == metric, 'Difference (SO-GPT)'].values[0]
-        if abs(diff) > 2:  # Only show difference if it's more than 2 percentage points
-            color = 'green' if diff < 0 else 'red'  # Green if GPT is better (less TD), red if SO is better
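+        if abs(diff) > 2:  # only annotate gaps larger than 2 percentage points
+            color = 'green' if diff < 0 else 'red'  # diff is SO-GPT: green when the SO rate is lower, red otherwise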
             plt.text(i, max(results_df.loc[i, 'GPT Rate (%)'], results_df.loc[i, 'SO Rate (%)']) + 2,
                     f"{diff:+.1f}pp", ha='center', va='bottom', color=color, fontweight='bold')
     
@@ -397,7 +364,6 @@ def calculate_association_metrics(table):
     # 4a. Odds ratio (upper left)
     ax1 = axes[0, 0]
     odds_ratios = results_df['Odds Ratio'].values
-    # Cap extremely large values for visualization
     odds_ratios_capped = [min(x, 10) for x in odds_ratios]
     
     # Use plt.bar directly with metrics for x values
@@ -463,20 +429,18 @@ def calculate_association_metrics(table):
     td_dist = distributions_df[distributions_df['Metric'] == 'TD'].iloc[0]
     
     # Create figure with more space between subplots
-    fig, ax = plt.subplots(2, 2, figsize=(16, 14))  # Increased figure size
-    plt.subplots_adjust(wspace=0.3, hspace=0.4)  # Add more space between subplots
+    fig, ax = plt.subplots(2, 2, figsize=(16, 14)) 
+    plt.subplots_adjust(wspace=0.3, hspace=0.4)  
     
     # 5a. TD Distribution (upper left)
     labels = ['Both have TD', 'Only GPT has TD', 'Only SO has TD', 'Neither has TD']
     sizes = [td_dist['Both (%)'], td_dist['Only GPT (%)'], td_dist['Only SO (%)'], td_dist['Neither (%)']]
     colors = [custom_palette['Both'], custom_palette['GPT'], custom_palette['SO'], custom_palette['Neither']]
     
     ax[0, 0].pie(sizes, labels=None, colors=colors, autopct='%1.1f%%', startangle=90)
-    ax[0, 0].set_title('Technical Debt Distribution', fontsize=14, pad=20)  # Add padding to title
+    ax[0, 0].set_title('Technical Debt Distribution', fontsize=14, pad=20) 
     
-    # Fix the count for "Only SO has TD" if needed
     so_only_count = int(td_dist['Only SO'])
-    # Add legend with counts - position adjusted to avoid overlap
     legend_labels = [
         f"Both have TD ({int(td_dist['Both'])})",
         f"Only GPT has TD ({int(td_dist['Only GPT'])})",
@@ -492,13 +456,11 @@ def calculate_association_metrics(table):
     ax[0, 1].set_title('Technical Debt Prevalence', fontsize=14, pad=20)  # Add padding to title
     ax[0, 1].set_ylabel('Percentage of Solutions with TD')
     
-    # Add text showing the difference - position adjusted to avoid overlap with title
     diff = td_data['Difference (SO-GPT)']
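     color = 'green' if diff > 0 else 'red'  # Green if SO has more TD (GPT is better), red otherwise. NOTE(review): the prevalence chart uses the opposite sign (green when diff < 0) - confirm which convention is intended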
     
-    # Position the difference text at the top of the plot with more space
     y_max = max(heights)
-    ax[0, 1].set_ylim(0, y_max * 1.2)  # Extend y-axis to make room for text
+    ax[0, 1].set_ylim(0, y_max * 1.2)  
     ax[0, 1].text(0.5, y_max * 1.15, f"Difference: {diff:.1f}pp", 
                  ha='center', va='center', color=color, fontweight='bold', fontsize=12)
     
@@ -507,14 +469,12 @@ def calculate_association_metrics(table):
     heights = [td_data['P(GPT|SO)'], td_data['P(GPT|~SO)'], td_data['P(SO|GPT)'], td_data['P(SO|~GPT)']]
     colors = [custom_palette['GPT'], custom_palette['GPT'], custom_palette['SO'], custom_palette['SO']]
     ax[1, 0].bar(x_labels, heights, color=colors)
-    ax[1, 0].set_title('Conditional Probabilities', fontsize=14, pad=20)  # Add padding to title
+    ax[1, 0].set_title('Conditional Probabilities', fontsize=14, pad=20) 
     ax[1, 0].set_ylabel('Probability')
     
-    # Add text for each bar - improved positioning
     for i, h in enumerate(heights):
         ax[1, 0].text(i, h/2, f"{h*100:.1f}%", ha='center', va='center', color='white', fontweight='bold')
     
-    # 5d. Association Metrics (lower right) - FIXED HERE
     # Create separate y-axis scales for different metrics
     ax1 = ax[1, 1]
     ax1.set_title('Association Metrics', fontsize=14, pad=20)  # Add padding to title
@@ -545,7 +505,7 @@ def calculate_association_metrics(table):
     ax2.text(x_positions[2], td_data['Odds Ratio']/2, f"{td_data['Odds Ratio']:.2f}", 
              ha='center', va='center', color='white', fontweight='bold')
     
-    # Add legend - position it ABOVE the plot to avoid overlap
+    # Add legend
     legend_elements = [
         plt.Rectangle((0,0), 1, 1, color='#1f77b4', label='Test Statistic'),
         plt.Rectangle((0,0), 1, 1, color='#ff7f0e', label='Phi Coefficient'),
@@ -557,15 +517,14 @@ def calculate_association_metrics(table):
     # Ensure proper scaling for secondary y-axis
     ax2.set_ylim(0, max(td_data['Phi Coefficient'], td_data['Odds Ratio']) * 1.2)
     
-    plt.tight_layout(pad=4.0)  # Increased padding between subplots
+    plt.tight_layout(pad=4.0) 
     plt.savefig('overall_td_analysis.png', dpi=300, bbox_inches='tight')
     print("\nOverall technical debt analysis visualization saved as 'overall_td_analysis.png'")
     
 else:
     print("\nNo analysis results to visualize.")
 
-# Print the final contingency table for TD
 print("\nContingency table for final TD(so) vs TD(gpt)!")
 contingency_table = pd.crosstab(df['TD(so)'], df['TD(gpt)'], margins=True, margins_name="Total")
 print("\nContingency Table:")
-print(contingency_table)
+print(contingency_table)