33import matplotlib .pyplot as plt
44import seaborn as sns
55from scipy .stats import chi2_contingency , fisher_exact
6- import os
76import math
87from matplotlib .patches import Patch
98
10- # Set style for better visualizations
119plt .style .use ('ggplot' )
1210sns .set (font_scale = 1.1 )
1311sns .set_style ("whitegrid" )
1412
15- # Define custom color palette
1613custom_palette = {
17- 'SO' : '#1f77b4' , # Blue
18- 'GPT' : '#ff7f0e' , # Orange
19- 'Both' : '#2ca02c' , # Green
20- 'Neither' : '#d62728' # Red
14+ 'SO' : '#1f77b4' ,
15+ 'GPT' : '#ff7f0e' ,
16+ 'Both' : '#2ca02c' ,
17+ 'Neither' : '#d62728'
2118}
2219
23- # Define path to dataset
24- path = "final/ML-code-smell-detection-main/finall_dataset_so_vs_gpt .csv"
20+ path = "dataset_so_vs_gpt.csv"
2521df = pd .read_csv (path )
2622
2723print ("\n Dataset Information:" )
3026print ("\n Column names:" )
3127print (df .columns .tolist ())
3228
33- # Process cyclomatic complexity
3429print ("\n Processing cyclomatic complexity" )
3530
3631so_cc_col = [col for col in df .columns if 'Cyclomatic complexity(so)' in col ]
5146else :
5247 print ("Cyclomatic complexity columns not found in the dataset" )
5348
54- # Get metrics columns
49+
5550so_metrics = [col for col in df .columns if '(so)' in col and 'code_snippet' not in col ]
5651gpt_metrics = [col for col in df .columns if '(gpt)' in col and 'code_snippet' not in col ]
5752
5853print (f"Found { len (so_metrics )} StackOverflow metrics and { len (gpt_metrics )} GPT metrics" )
5954
60- # Function to calculate association metrics from a contingency table
6155def calculate_association_metrics (table ):
62- # Extract values from the table
6356 try :
64- # Make sure we're handling string indices
65- a = table .loc ['True' , 'True' ] # True, True
66- b = table .loc ['True' , 'False' ] # True, False
67- c = table .loc ['False' , 'True' ] # False, True
68- d = table .loc ['False' , 'False' ] # False, False
57+ a = table .loc ['True' , 'True' ]
58+ b = table .loc ['True' , 'False' ]
59+ c = table .loc ['False' , 'True' ]
60+ d = table .loc ['False' , 'False' ]
6961
7062 # Calculate Phi coefficient
7163 n = a + b + c + d
@@ -76,7 +68,6 @@ def calculate_association_metrics(table):
7668
7769 # Calculate Odds ratio with handling for zero cells
7870 if b * c == 0 :
79- # Add a small constant to zero cells for odds ratio calculation
8071 adj_b = b if b > 0 else 0.5
8172 adj_c = c if c > 0 else 0.5
8273 odds_ratio = (a * d ) / (adj_b * adj_c )
@@ -96,9 +87,9 @@ def calculate_association_metrics(table):
9687 neither_pct = (d / n ) * 100 # Neither has TD
9788
9889 # Calculate prevalence rates
99- gpt_rate = ((a + b ) / n ) * 100 # GPT TD rate
100- so_rate = ((a + c ) / n ) * 100 # SO TD rate
101- difference = so_rate - gpt_rate # SO - GPT difference
90+ gpt_rate = ((a + b ) / n ) * 100
91+ so_rate = ((a + c ) / n ) * 100
92+ difference = so_rate - gpt_rate
10293
10394 return {
10495 'phi' : phi ,
@@ -126,41 +117,32 @@ def calculate_association_metrics(table):
126117 print (f"Error calculating association metrics: { e } " )
127118 return None
128119
129- # Perform statistical tests between corresponding SO and GPT metrics
130120print ("\n --- Statistical Tests and Association Metrics: SO vs GPT for each metric ---" )
131121
132122all_results = []
133123distribution_data = []
134124
135125for so_metric , gpt_metric in zip (so_metrics , gpt_metrics ):
136- # Skip if not categorical
137126 if df [so_metric ].dtype == 'float64' or df [gpt_metric ].dtype == 'float64' :
138127 continue
139128
140- # Create contingency table (False/True for consistency)
141- # Convert values to boolean for consistent table formatting
142129 df_temp = df .copy ()
143- # Determine if columns are boolean or have other values (like True/False strings)
144130 if df [so_metric ].dtype == bool :
145- pass # Already boolean
131+ pass
146132 elif set (df [so_metric ].dropna ().unique ()).issubset ({True , False , 'True' , 'False' , 'TRUE' , 'FALSE' }):
147133 df_temp [so_metric ] = df_temp [so_metric ].map (lambda x : x == True or x == 'True' or x == 'TRUE' )
148134 else :
149- # For non-boolean categorical, convert to boolean based on presence of any value
150135 df_temp [so_metric ] = df_temp [so_metric ].notna () & (df_temp [so_metric ] != False ) & (df_temp [so_metric ] != 'False' ) & (df_temp [so_metric ] != 'FALSE' )
151136
152137 if df [gpt_metric ].dtype == bool :
153- pass # Already boolean
138+ pass
154139 elif set (df [gpt_metric ].dropna ().unique ()).issubset ({True , False , 'True' , 'False' , 'TRUE' , 'FALSE' }):
155140 df_temp [gpt_metric ] = df_temp [gpt_metric ].map (lambda x : x == True or x == 'True' or x == 'TRUE' )
156141 else :
157- # For non-boolean categorical, convert to boolean based on presence of any value
158142 df_temp [gpt_metric ] = df_temp [gpt_metric ].notna () & (df_temp [gpt_metric ] != False ) & (df_temp [gpt_metric ] != 'False' ) & (df_temp [gpt_metric ] != 'FALSE' )
159143
160144 contingency_table = pd .crosstab (df_temp [so_metric ], df_temp [gpt_metric ])
161145
162- # Ensure the contingency table has the proper structure with False/True values
163- # Convert index and columns to string to avoid boolean indexing issues
164146 contingency_table .index = contingency_table .index .astype (str )
165147 contingency_table .columns = contingency_table .columns .astype (str )
166148
@@ -175,7 +157,6 @@ def calculate_association_metrics(table):
175157 # Sort to ensure consistent order
176158 contingency_table = contingency_table .reindex (index = ['False' , 'True' ], columns = ['False' , 'True' ])
177159
178- # Perform statistical tests - use Fisher's exact test instead of chi-square for small counts
179160 try :
180161 # Check if chi-square test is appropriate (expected frequencies > 5)
181162 row_totals = contingency_table .sum (axis = 1 )
@@ -185,8 +166,7 @@ def calculate_association_metrics(table):
185166 # Calculate expected frequencies
186167 expected = np .outer (row_totals , col_totals ) / n
187168
188- # Determine which test to use
189- use_fisher = np .any (expected < 1 )
169+ use_fisher = np .any (expected < 5 )
190170
191171 if use_fisher :
192172 # Use Fisher's exact test
@@ -203,7 +183,6 @@ def calculate_association_metrics(table):
203183 association_metrics = calculate_association_metrics (contingency_table )
204184
205185 if association_metrics :
206- # Extract metric name (remove "(so)" suffix)
207186 metric_name = so_metric .replace ('(so)' , '' )
208187
209188 # Store results
@@ -266,18 +245,14 @@ def calculate_association_metrics(table):
266245 print (f"Error performing analysis for { so_metric } vs { gpt_metric } : { e } " )
267246 print (f"Contingency table: \n { contingency_table } " )
268247
269- # Create summary dataframe and visualizations if results exist
270248if all_results :
271- # Create summary dataframe
272249 results_df = pd .DataFrame (all_results )
273250 print ("\n --- Analysis Results Summary ---" )
274251 print (results_df .to_string ())
275252
276253 # Create distributions dataframe
277254 distributions_df = pd .DataFrame (distribution_data )
278-
279- # ------- Create Visualizations -------
280-
255+
281256 # 1. Test Statistic and Phi Coefficient visualization
282257 plt .figure (figsize = (14 , 7 ))
283258 metrics = results_df ['Metric' ].tolist ()
@@ -286,7 +261,6 @@ def calculate_association_metrics(table):
286261
287262 # Create two-part bar chart
288263 ax = plt .subplot (111 )
289- # Use a normalized test statistic for better visualization across different tests
290264 normalized_stats = []
291265 for i , test in enumerate (results_df ['Test' ]):
292266 if test == "Chi-square test" :
@@ -322,12 +296,10 @@ def calculate_association_metrics(table):
322296 # 2. Technical Debt Distribution visualization
323297 plt .figure (figsize = (16 , 8 ))
324298
325- # Create stacked bar chart for distribution
326299 metrics = distributions_df ['Metric' ].tolist ()
327300 x = np .arange (len (metrics ))
328301 width = 0.7
329302
330- # Stacked bars
331303 plt .bar (x , distributions_df ['Both (%)' ], width , label = 'Both have TD' , color = custom_palette ['Both' ])
332304 plt .bar (x , distributions_df ['Only GPT (%)' ], width , bottom = distributions_df ['Both (%)' ],
333305 label = 'Only GPT has TD' , color = custom_palette ['GPT' ])
@@ -338,20 +310,17 @@ def calculate_association_metrics(table):
338310 bottom = distributions_df ['Both (%)' ] + distributions_df ['Only GPT (%)' ] + distributions_df ['Only SO (%)' ],
339311 label = 'Neither has TD' , color = custom_palette ['Neither' ])
340312
341- # Add labels and title
342313 plt .ylabel ('Percentage of Cases' )
343314 plt .title ('Technical Debt Distribution by Metric' )
344315 plt .xticks (x , metrics , rotation = 45 , ha = 'right' )
345316 plt .legend (loc = 'upper left' , bbox_to_anchor = (1 , 1 ))
346317
347- # Add percentage labels on each segment (only for segments > 5%)
348318 for i , metric in enumerate (metrics ):
349- # Variables to track the bottom of each bar
350319 bottom = 0
351320 for category , color_key in [('Both (%)' , 'Both' ), ('Only GPT (%)' , 'GPT' ),
352321 ('Only SO (%)' , 'SO' ), ('Neither (%)' , 'Neither' )]:
353322 value = distributions_df .loc [distributions_df ['Metric' ] == metric , category ].values [0 ]
354- if value > 5 : # Only add text if segment is > 5%
323+ if value > 5 :
355324 plt .text (i , bottom + value / 2 , f'{ value :.1f} %' , ha = 'center' , va = 'center' ,
356325 fontweight = 'bold' , color = 'black' )
357326 bottom += value
@@ -363,18 +332,16 @@ def calculate_association_metrics(table):
363332 # 3. Prevalence Comparison visualization
364333 plt .figure (figsize = (14 , 7 ))
365334
366- # Create grouped bar chart
367335 x = np .arange (len (metrics ))
368336 width = 0.35
369337
370338 plt .bar (x - width / 2 , results_df ['GPT Rate (%)' ], width , label = 'GPT TD Rate' , color = custom_palette ['GPT' ])
371339 plt .bar (x + width / 2 , results_df ['SO Rate (%)' ], width , label = 'SO TD Rate' , color = custom_palette ['SO' ])
372340
373- # Add text showing the difference
374341 for i , metric in enumerate (metrics ):
375342 diff = results_df .loc [results_df ['Metric' ] == metric , 'Difference (SO-GPT)' ].values [0 ]
376- if abs (diff ) > 2 : # Only show difference if it's more than 2 percentage points
377- color = 'green' if diff < 0 else 'red' # Green if GPT is better (less TD), red if SO is better
343+ if abs (diff ) > 2 : #
344+ color = 'green' if diff < 0 else 'red'
378345 plt .text (i , max (results_df .loc [i , 'GPT Rate (%)' ], results_df .loc [i , 'SO Rate (%)' ]) + 2 ,
379346 f"{ diff :+.1f} pp" , ha = 'center' , va = 'bottom' , color = color , fontweight = 'bold' )
380347
@@ -397,7 +364,6 @@ def calculate_association_metrics(table):
397364 # 4a. Odds ratio (upper left)
398365 ax1 = axes [0 , 0 ]
399366 odds_ratios = results_df ['Odds Ratio' ].values
400- # Cap extremely large values for visualization
401367 odds_ratios_capped = [min (x , 10 ) for x in odds_ratios ]
402368
403369 # Use plt.bar directly with metrics for x values
@@ -463,20 +429,18 @@ def calculate_association_metrics(table):
463429 td_dist = distributions_df [distributions_df ['Metric' ] == 'TD' ].iloc [0 ]
464430
465431 # Create figure with more space between subplots
466- fig , ax = plt .subplots (2 , 2 , figsize = (16 , 14 )) # Increased figure size
467- plt .subplots_adjust (wspace = 0.3 , hspace = 0.4 ) # Add more space between subplots
432+ fig , ax = plt .subplots (2 , 2 , figsize = (16 , 14 ))
433+ plt .subplots_adjust (wspace = 0.3 , hspace = 0.4 )
468434
469435 # 5a. TD Distribution (upper left)
470436 labels = ['Both have TD' , 'Only GPT has TD' , 'Only SO has TD' , 'Neither has TD' ]
471437 sizes = [td_dist ['Both (%)' ], td_dist ['Only GPT (%)' ], td_dist ['Only SO (%)' ], td_dist ['Neither (%)' ]]
472438 colors = [custom_palette ['Both' ], custom_palette ['GPT' ], custom_palette ['SO' ], custom_palette ['Neither' ]]
473439
474440 ax [0 , 0 ].pie (sizes , labels = None , colors = colors , autopct = '%1.1f%%' , startangle = 90 )
475- ax [0 , 0 ].set_title ('Technical Debt Distribution' , fontsize = 14 , pad = 20 ) # Add padding to title
441+ ax [0 , 0 ].set_title ('Technical Debt Distribution' , fontsize = 14 , pad = 20 )
476442
477- # Fix the count for "Only SO has TD" if needed
478443 so_only_count = int (td_dist ['Only SO' ])
479- # Add legend with counts - position adjusted to avoid overlap
480444 legend_labels = [
481445 f"Both have TD ({ int (td_dist ['Both' ])} )" ,
482446 f"Only GPT has TD ({ int (td_dist ['Only GPT' ])} )" ,
@@ -492,13 +456,11 @@ def calculate_association_metrics(table):
492456 ax [0 , 1 ].set_title ('Technical Debt Prevalence' , fontsize = 14 , pad = 20 ) # Add padding to title
493457 ax [0 , 1 ].set_ylabel ('Percentage of Solutions with TD' )
494458
495- # Add text showing the difference - position adjusted to avoid overlap with title
496459 diff = td_data ['Difference (SO-GPT)' ]
497460 color = 'green' if diff > 0 else 'red' # Green if SO has more TD (GPT is better), red otherwise
498461
499- # Position the difference text at the top of the plot with more space
500462 y_max = max (heights )
501- ax [0 , 1 ].set_ylim (0 , y_max * 1.2 ) # Extend y-axis to make room for text
463+ ax [0 , 1 ].set_ylim (0 , y_max * 1.2 )
502464 ax [0 , 1 ].text (0.5 , y_max * 1.15 , f"Difference: { diff :.1f} pp" ,
503465 ha = 'center' , va = 'center' , color = color , fontweight = 'bold' , fontsize = 12 )
504466
@@ -507,14 +469,12 @@ def calculate_association_metrics(table):
507469 heights = [td_data ['P(GPT|SO)' ], td_data ['P(GPT|~SO)' ], td_data ['P(SO|GPT)' ], td_data ['P(SO|~GPT)' ]]
508470 colors = [custom_palette ['GPT' ], custom_palette ['GPT' ], custom_palette ['SO' ], custom_palette ['SO' ]]
509471 ax [1 , 0 ].bar (x_labels , heights , color = colors )
510- ax [1 , 0 ].set_title ('Conditional Probabilities' , fontsize = 14 , pad = 20 ) # Add padding to title
472+ ax [1 , 0 ].set_title ('Conditional Probabilities' , fontsize = 14 , pad = 20 )
511473 ax [1 , 0 ].set_ylabel ('Probability' )
512474
513- # Add text for each bar - improved positioning
514475 for i , h in enumerate (heights ):
515476 ax [1 , 0 ].text (i , h / 2 , f"{ h * 100 :.1f} %" , ha = 'center' , va = 'center' , color = 'white' , fontweight = 'bold' )
516477
517- # 5d. Association Metrics (lower right) - FIXED HERE
518478 # Create separate y-axis scales for different metrics
519479 ax1 = ax [1 , 1 ]
520480 ax1 .set_title ('Association Metrics' , fontsize = 14 , pad = 20 ) # Add padding to title
@@ -545,7 +505,7 @@ def calculate_association_metrics(table):
545505 ax2 .text (x_positions [2 ], td_data ['Odds Ratio' ]/ 2 , f"{ td_data ['Odds Ratio' ]:.2f} " ,
546506 ha = 'center' , va = 'center' , color = 'white' , fontweight = 'bold' )
547507
548- # Add legend - position it ABOVE the plot to avoid overlap
508+ # Add legend
549509 legend_elements = [
550510 plt .Rectangle ((0 ,0 ), 1 , 1 , color = '#1f77b4' , label = 'Test Statistic' ),
551511 plt .Rectangle ((0 ,0 ), 1 , 1 , color = '#ff7f0e' , label = 'Phi Coefficient' ),
@@ -557,15 +517,14 @@ def calculate_association_metrics(table):
557517 # Ensure proper scaling for secondary y-axis
558518 ax2 .set_ylim (0 , max (td_data ['Phi Coefficient' ], td_data ['Odds Ratio' ]) * 1.2 )
559519
560- plt .tight_layout (pad = 4.0 ) # Increased padding between subplots
520+ plt .tight_layout (pad = 4.0 )
561521 plt .savefig ('overall_td_analysis.png' , dpi = 300 , bbox_inches = 'tight' )
562522 print ("\n Overall technical debt analysis visualization saved as 'overall_td_analysis.png'" )
563523
564524else :
565525 print ("\n No analysis results to visualize." )
566526
567- # Print the final contingency table for TD
568527print ("\n Contingency table for final TD(so) vs TD(gpt)!" )
569528contingency_table = pd .crosstab (df ['TD(so)' ], df ['TD(gpt)' ], margins = True , margins_name = "Total" )
570529print ("\n Contingency Table:" )
571- print (contingency_table )
530+ print (contingency_table )
0 commit comments