Skip to content

Commit eaf9793

Browse files
Update statistical_analysis.py
1 parent 854d43b commit eaf9793

1 file changed

Lines changed: 28 additions & 69 deletions

File tree

models/metarials of experiment/statistical_analysis.py

Lines changed: 28 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -3,25 +3,21 @@
33
import matplotlib.pyplot as plt
44
import seaborn as sns
55
from scipy.stats import chi2_contingency, fisher_exact
6-
import os
76
import math
87
from matplotlib.patches import Patch
98

10-
# Set style for better visualizations
119
plt.style.use('ggplot')
1210
sns.set(font_scale=1.1)
1311
sns.set_style("whitegrid")
1412

15-
# Define custom color palette
1613
custom_palette = {
17-
'SO': '#1f77b4', # Blue
18-
'GPT': '#ff7f0e', # Orange
19-
'Both': '#2ca02c', # Green
20-
'Neither': '#d62728' # Red
14+
'SO': '#1f77b4',
15+
'GPT': '#ff7f0e',
16+
'Both': '#2ca02c',
17+
'Neither': '#d62728'
2118
}
2219

23-
# Define path to dataset
24-
path = "final/ML-code-smell-detection-main/finall_dataset_so_vs_gpt .csv"
20+
path = "dataset_so_vs_gpt.csv"
2521
df = pd.read_csv(path)
2622

2723
print("\nDataset Information:")
@@ -30,7 +26,6 @@
3026
print("\nColumn names:")
3127
print(df.columns.tolist())
3228

33-
# Process cyclomatic complexity
3429
print("\nProcessing cyclomatic complexity")
3530

3631
so_cc_col = [col for col in df.columns if 'Cyclomatic complexity(so)' in col]
@@ -51,21 +46,18 @@
5146
else:
5247
print("Cyclomatic complexity columns not found in the dataset")
5348

54-
# Get metrics columns
49+
5550
so_metrics = [col for col in df.columns if '(so)' in col and 'code_snippet' not in col]
5651
gpt_metrics = [col for col in df.columns if '(gpt)' in col and 'code_snippet' not in col]
5752

5853
print(f"Found {len(so_metrics)} StackOverflow metrics and {len(gpt_metrics)} GPT metrics")
5954

60-
# Function to calculate association metrics from a contingency table
6155
def calculate_association_metrics(table):
62-
# Extract values from the table
6356
try:
64-
# Make sure we're handling string indices
65-
a = table.loc['True', 'True'] # True, True
66-
b = table.loc['True', 'False'] # True, False
67-
c = table.loc['False', 'True'] # False, True
68-
d = table.loc['False', 'False'] # False, False
57+
a = table.loc['True', 'True']
58+
b = table.loc['True', 'False']
59+
c = table.loc['False', 'True']
60+
d = table.loc['False', 'False']
6961

7062
# Calculate Phi coefficient
7163
n = a + b + c + d
@@ -76,7 +68,6 @@ def calculate_association_metrics(table):
7668

7769
# Calculate Odds ratio with handling for zero cells
7870
if b*c == 0:
79-
# Add a small constant to zero cells for odds ratio calculation
8071
adj_b = b if b > 0 else 0.5
8172
adj_c = c if c > 0 else 0.5
8273
odds_ratio = (a*d) / (adj_b*adj_c)
@@ -96,9 +87,9 @@ def calculate_association_metrics(table):
9687
neither_pct = (d / n) * 100 # Neither has TD
9788

9889
# Calculate prevalence rates
99-
gpt_rate = ((a + b) / n) * 100 # GPT TD rate
100-
so_rate = ((a + c) / n) * 100 # SO TD rate
101-
difference = so_rate - gpt_rate # SO - GPT difference
90+
gpt_rate = ((a + b) / n) * 100
91+
so_rate = ((a + c) / n) * 100
92+
difference = so_rate - gpt_rate
10293

10394
return {
10495
'phi': phi,
@@ -126,41 +117,32 @@ def calculate_association_metrics(table):
126117
print(f"Error calculating association metrics: {e}")
127118
return None
128119

129-
# Perform statistical tests between corresponding SO and GPT metrics
130120
print("\n--- Statistical Tests and Association Metrics: SO vs GPT for each metric ---")
131121

132122
all_results = []
133123
distribution_data = []
134124

135125
for so_metric, gpt_metric in zip(so_metrics, gpt_metrics):
136-
# Skip if not categorical
137126
if df[so_metric].dtype == 'float64' or df[gpt_metric].dtype == 'float64':
138127
continue
139128

140-
# Create contingency table (False/True for consistency)
141-
# Convert values to boolean for consistent table formatting
142129
df_temp = df.copy()
143-
# Determine if columns are boolean or have other values (like True/False strings)
144130
if df[so_metric].dtype == bool:
145-
pass # Already boolean
131+
pass
146132
elif set(df[so_metric].dropna().unique()).issubset({True, False, 'True', 'False', 'TRUE', 'FALSE'}):
147133
df_temp[so_metric] = df_temp[so_metric].map(lambda x: x == True or x == 'True' or x == 'TRUE')
148134
else:
149-
# For non-boolean categorical, convert to boolean based on presence of any value
150135
df_temp[so_metric] = df_temp[so_metric].notna() & (df_temp[so_metric] != False) & (df_temp[so_metric] != 'False') & (df_temp[so_metric] != 'FALSE')
151136

152137
if df[gpt_metric].dtype == bool:
153-
pass # Already boolean
138+
pass
154139
elif set(df[gpt_metric].dropna().unique()).issubset({True, False, 'True', 'False', 'TRUE', 'FALSE'}):
155140
df_temp[gpt_metric] = df_temp[gpt_metric].map(lambda x: x == True or x == 'True' or x == 'TRUE')
156141
else:
157-
# For non-boolean categorical, convert to boolean based on presence of any value
158142
df_temp[gpt_metric] = df_temp[gpt_metric].notna() & (df_temp[gpt_metric] != False) & (df_temp[gpt_metric] != 'False') & (df_temp[gpt_metric] != 'FALSE')
159143

160144
contingency_table = pd.crosstab(df_temp[so_metric], df_temp[gpt_metric])
161145

162-
# Ensure the contingency table has the proper structure with False/True values
163-
# Convert index and columns to string to avoid boolean indexing issues
164146
contingency_table.index = contingency_table.index.astype(str)
165147
contingency_table.columns = contingency_table.columns.astype(str)
166148

@@ -175,7 +157,6 @@ def calculate_association_metrics(table):
175157
# Sort to ensure consistent order
176158
contingency_table = contingency_table.reindex(index=['False', 'True'], columns=['False', 'True'])
177159

178-
# Perform statistical tests - use Fisher's exact test instead of chi-square for small counts
179160
try:
180161
# Check if chi-square test is appropriate (expected frequencies > 5)
181162
row_totals = contingency_table.sum(axis=1)
@@ -185,8 +166,7 @@ def calculate_association_metrics(table):
185166
# Calculate expected frequencies
186167
expected = np.outer(row_totals, col_totals) / n
187168

188-
# Determine which test to use
189-
use_fisher = np.any(expected < 1)
169+
use_fisher = np.any(expected < 5)
190170

191171
if use_fisher:
192172
# Use Fisher's exact test
@@ -203,7 +183,6 @@ def calculate_association_metrics(table):
203183
association_metrics = calculate_association_metrics(contingency_table)
204184

205185
if association_metrics:
206-
# Extract metric name (remove "(so)" suffix)
207186
metric_name = so_metric.replace('(so)', '')
208187

209188
# Store results
@@ -266,18 +245,14 @@ def calculate_association_metrics(table):
266245
print(f"Error performing analysis for {so_metric} vs {gpt_metric}: {e}")
267246
print(f"Contingency table: \n{contingency_table}")
268247

269-
# Create summary dataframe and visualizations if results exist
270248
if all_results:
271-
# Create summary dataframe
272249
results_df = pd.DataFrame(all_results)
273250
print("\n--- Analysis Results Summary ---")
274251
print(results_df.to_string())
275252

276253
# Create distributions dataframe
277254
distributions_df = pd.DataFrame(distribution_data)
278-
279-
# ------- Create Visualizations -------
280-
255+
281256
# 1. Test Statistic and Phi Coefficient visualization
282257
plt.figure(figsize=(14, 7))
283258
metrics = results_df['Metric'].tolist()
@@ -286,7 +261,6 @@ def calculate_association_metrics(table):
286261

287262
# Create two-part bar chart
288263
ax = plt.subplot(111)
289-
# Use a normalized test statistic for better visualization across different tests
290264
normalized_stats = []
291265
for i, test in enumerate(results_df['Test']):
292266
if test == "Chi-square test":
@@ -322,12 +296,10 @@ def calculate_association_metrics(table):
322296
# 2. Technical Debt Distribution visualization
323297
plt.figure(figsize=(16, 8))
324298

325-
# Create stacked bar chart for distribution
326299
metrics = distributions_df['Metric'].tolist()
327300
x = np.arange(len(metrics))
328301
width = 0.7
329302

330-
# Stacked bars
331303
plt.bar(x, distributions_df['Both (%)'], width, label='Both have TD', color=custom_palette['Both'])
332304
plt.bar(x, distributions_df['Only GPT (%)'], width, bottom=distributions_df['Both (%)'],
333305
label='Only GPT has TD', color=custom_palette['GPT'])
@@ -338,20 +310,17 @@ def calculate_association_metrics(table):
338310
bottom=distributions_df['Both (%)'] + distributions_df['Only GPT (%)'] + distributions_df['Only SO (%)'],
339311
label='Neither has TD', color=custom_palette['Neither'])
340312

341-
# Add labels and title
342313
plt.ylabel('Percentage of Cases')
343314
plt.title('Technical Debt Distribution by Metric')
344315
plt.xticks(x, metrics, rotation=45, ha='right')
345316
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
346317

347-
# Add percentage labels on each segment (only for segments > 5%)
348318
for i, metric in enumerate(metrics):
349-
# Variables to track the bottom of each bar
350319
bottom = 0
351320
for category, color_key in [('Both (%)', 'Both'), ('Only GPT (%)', 'GPT'),
352321
('Only SO (%)', 'SO'), ('Neither (%)', 'Neither')]:
353322
value = distributions_df.loc[distributions_df['Metric'] == metric, category].values[0]
354-
if value > 5: # Only add text if segment is > 5%
323+
if value > 5:
355324
plt.text(i, bottom + value/2, f'{value:.1f}%', ha='center', va='center',
356325
fontweight='bold', color='black')
357326
bottom += value
@@ -363,18 +332,16 @@ def calculate_association_metrics(table):
363332
# 3. Prevalence Comparison visualization
364333
plt.figure(figsize=(14, 7))
365334

366-
# Create grouped bar chart
367335
x = np.arange(len(metrics))
368336
width = 0.35
369337

370338
plt.bar(x - width/2, results_df['GPT Rate (%)'], width, label='GPT TD Rate', color=custom_palette['GPT'])
371339
plt.bar(x + width/2, results_df['SO Rate (%)'], width, label='SO TD Rate', color=custom_palette['SO'])
372340

373-
# Add text showing the difference
374341
for i, metric in enumerate(metrics):
375342
diff = results_df.loc[results_df['Metric'] == metric, 'Difference (SO-GPT)'].values[0]
376-
if abs(diff) > 2: # Only show difference if it's more than 2 percentage points
377-
color = 'green' if diff < 0 else 'red' # Green if GPT is better (less TD), red if SO is better
343+
if abs(diff) > 2: #
344+
color = 'green' if diff < 0 else 'red'
378345
plt.text(i, max(results_df.loc[i, 'GPT Rate (%)'], results_df.loc[i, 'SO Rate (%)']) + 2,
379346
f"{diff:+.1f}pp", ha='center', va='bottom', color=color, fontweight='bold')
380347

@@ -397,7 +364,6 @@ def calculate_association_metrics(table):
397364
# 4a. Odds ratio (upper left)
398365
ax1 = axes[0, 0]
399366
odds_ratios = results_df['Odds Ratio'].values
400-
# Cap extremely large values for visualization
401367
odds_ratios_capped = [min(x, 10) for x in odds_ratios]
402368

403369
# Use plt.bar directly with metrics for x values
@@ -463,20 +429,18 @@ def calculate_association_metrics(table):
463429
td_dist = distributions_df[distributions_df['Metric'] == 'TD'].iloc[0]
464430

465431
# Create figure with more space between subplots
466-
fig, ax = plt.subplots(2, 2, figsize=(16, 14)) # Increased figure size
467-
plt.subplots_adjust(wspace=0.3, hspace=0.4) # Add more space between subplots
432+
fig, ax = plt.subplots(2, 2, figsize=(16, 14))
433+
plt.subplots_adjust(wspace=0.3, hspace=0.4)
468434

469435
# 5a. TD Distribution (upper left)
470436
labels = ['Both have TD', 'Only GPT has TD', 'Only SO has TD', 'Neither has TD']
471437
sizes = [td_dist['Both (%)'], td_dist['Only GPT (%)'], td_dist['Only SO (%)'], td_dist['Neither (%)']]
472438
colors = [custom_palette['Both'], custom_palette['GPT'], custom_palette['SO'], custom_palette['Neither']]
473439

474440
ax[0, 0].pie(sizes, labels=None, colors=colors, autopct='%1.1f%%', startangle=90)
475-
ax[0, 0].set_title('Technical Debt Distribution', fontsize=14, pad=20) # Add padding to title
441+
ax[0, 0].set_title('Technical Debt Distribution', fontsize=14, pad=20)
476442

477-
# Fix the count for "Only SO has TD" if needed
478443
so_only_count = int(td_dist['Only SO'])
479-
# Add legend with counts - position adjusted to avoid overlap
480444
legend_labels = [
481445
f"Both have TD ({int(td_dist['Both'])})",
482446
f"Only GPT has TD ({int(td_dist['Only GPT'])})",
@@ -492,13 +456,11 @@ def calculate_association_metrics(table):
492456
ax[0, 1].set_title('Technical Debt Prevalence', fontsize=14, pad=20) # Add padding to title
493457
ax[0, 1].set_ylabel('Percentage of Solutions with TD')
494458

495-
# Add text showing the difference - position adjusted to avoid overlap with title
496459
diff = td_data['Difference (SO-GPT)']
497460
color = 'green' if diff > 0 else 'red' # Green if SO has more TD (GPT is better), red otherwise
498461

499-
# Position the difference text at the top of the plot with more space
500462
y_max = max(heights)
501-
ax[0, 1].set_ylim(0, y_max * 1.2) # Extend y-axis to make room for text
463+
ax[0, 1].set_ylim(0, y_max * 1.2)
502464
ax[0, 1].text(0.5, y_max * 1.15, f"Difference: {diff:.1f}pp",
503465
ha='center', va='center', color=color, fontweight='bold', fontsize=12)
504466

@@ -507,14 +469,12 @@ def calculate_association_metrics(table):
507469
heights = [td_data['P(GPT|SO)'], td_data['P(GPT|~SO)'], td_data['P(SO|GPT)'], td_data['P(SO|~GPT)']]
508470
colors = [custom_palette['GPT'], custom_palette['GPT'], custom_palette['SO'], custom_palette['SO']]
509471
ax[1, 0].bar(x_labels, heights, color=colors)
510-
ax[1, 0].set_title('Conditional Probabilities', fontsize=14, pad=20) # Add padding to title
472+
ax[1, 0].set_title('Conditional Probabilities', fontsize=14, pad=20)
511473
ax[1, 0].set_ylabel('Probability')
512474

513-
# Add text for each bar - improved positioning
514475
for i, h in enumerate(heights):
515476
ax[1, 0].text(i, h/2, f"{h*100:.1f}%", ha='center', va='center', color='white', fontweight='bold')
516477

517-
# 5d. Association Metrics (lower right) - FIXED HERE
518478
# Create separate y-axis scales for different metrics
519479
ax1 = ax[1, 1]
520480
ax1.set_title('Association Metrics', fontsize=14, pad=20) # Add padding to title
@@ -545,7 +505,7 @@ def calculate_association_metrics(table):
545505
ax2.text(x_positions[2], td_data['Odds Ratio']/2, f"{td_data['Odds Ratio']:.2f}",
546506
ha='center', va='center', color='white', fontweight='bold')
547507

548-
# Add legend - position it ABOVE the plot to avoid overlap
508+
# Add legend
549509
legend_elements = [
550510
plt.Rectangle((0,0), 1, 1, color='#1f77b4', label='Test Statistic'),
551511
plt.Rectangle((0,0), 1, 1, color='#ff7f0e', label='Phi Coefficient'),
@@ -557,15 +517,14 @@ def calculate_association_metrics(table):
557517
# Ensure proper scaling for secondary y-axis
558518
ax2.set_ylim(0, max(td_data['Phi Coefficient'], td_data['Odds Ratio']) * 1.2)
559519

560-
plt.tight_layout(pad=4.0) # Increased padding between subplots
520+
plt.tight_layout(pad=4.0)
561521
plt.savefig('overall_td_analysis.png', dpi=300, bbox_inches='tight')
562522
print("\nOverall technical debt analysis visualization saved as 'overall_td_analysis.png'")
563523

564524
else:
565525
print("\nNo analysis results to visualize.")
566526

567-
# Print the final contingency table for TD
568527
print("\nContingency table for final TD(so) vs TD(gpt)!")
569528
contingency_table = pd.crosstab(df['TD(so)'], df['TD(gpt)'], margins=True, margins_name="Total")
570529
print("\nContingency Table:")
571-
print(contingency_table)
530+
print(contingency_table)

0 commit comments

Comments
 (0)