Clone_Benchmark/evaluation.py at main · gallegomiguel/Clone_Benchmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import subprocess
import json
import numpy as np
import os
import sys

# CHANGE ME: how many times the whole experiment is repeated.
NUM_RUNS = 15

# Interpreter used to launch the model scripts (the one running this file).
PYTHON_EXE = sys.executable

# Per-model store of the values collected on every successful run; each
# metric name maps to a list that grows by one entry per run.
results = {
    model: {metric: [] for metric in ("f1", "precision", "recall", "time")}
    for model in ("ASTNN", "CodeBERT")
}

def run_script(script_path, work_dir, model_name):
    """Run a model script and return the metrics it reports on stdout.

    The script is launched with ``PYTHON_EXE`` in unbuffered mode (``-u``)
    and the arguments ``--lang java``, using ``work_dir`` as its working
    directory. Every stdout line is echoed, prefixed with ``model_name``.
    A line containing the ``__DATA_JSON__`` marker is parsed as JSON from
    the text that follows the marker; when several such lines parse, the
    last one wins.

    Args:
        script_path: Script to execute (passed to the interpreter as-is).
        work_dir: Working directory of the child process.
        model_name: Label used in the log output.

    Returns:
        The decoded JSON payload, or ``None`` when no non-empty payload was
        found; in that case the child's captured stderr is printed.
    """
    # Local import so this function stays self-contained.
    import threading

    print(f"Ejecutando {model_name}...")

    stderr_chunks = []
    output_json = None

    # The context manager closes both pipes and waits for the child on exit.
    with subprocess.Popen(
        [PYTHON_EXE, "-u", script_path, "--lang", "java"],
        cwd=work_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",  # undecodable child output must not abort the run
    ) as process:

        def _drain_stderr():
            stderr_chunks.append(process.stderr.read())

        # stderr has to be consumed while stdout is being streamed;
        # otherwise a child that fills the stderr pipe blocks forever and
        # this loop never sees the end of stdout.
        drainer = threading.Thread(target=_drain_stderr, daemon=True)
        drainer.start()

        for line in process.stdout:
            print(f"   [{model_name}] {line.strip()}")
            if "__DATA_JSON__" in line:  # this line carries the metrics
                json_str = line.split("__DATA_JSON__")[1].strip()
                try:
                    output_json = json.loads(json_str)
                except json.JSONDecodeError as exc:
                    # A broken metrics line should not kill the benchmark;
                    # keep any payload that was parsed earlier.
                    print(f"   [{model_name}] JSON inválido tras __DATA_JSON__: {exc}")

        drainer.join()

    if output_json:
        print(f"Resultados: F1={output_json['f1']:.4f}")
        return output_json

    print(f"Error: No se encontraron métricas finales para {model_name}")
    print("".join(stderr_chunks))
    return None

print(f"--- INICIANDO BENCHMARK DE {NUM_RUNS} EJECUCIONES ---")

# (script, working directory, model label) for each model, in execution order.
_MODEL_RUNS = (
    ("train.py", "astnn", "ASTNN"),
    ("train_codebert.py", "codebert", "CodeBERT"),
)

# (key in `results`, key in the metrics reported by the script), in the
# order the values are stored.
_METRIC_SOURCES = (
    ("f1", "f1"),
    ("precision", "precision"),
    ("recall", "recall"),
    ("time", "avg_inference_time"),
)

for run_number in range(1, NUM_RUNS + 1):
    print(f"\n=== VUELTA {run_number}/{NUM_RUNS} ===")

    for script_name, script_dir, model_label in _MODEL_RUNS:
        run_metrics = run_script(script_name, script_dir, model_label)
        if not run_metrics:
            # Nothing usable came back from this run; record nothing for it.
            continue
        model_results = results[model_label]
        for result_key, metric_key in _METRIC_SOURCES:
            model_results[result_key].append(run_metrics[metric_key])

print("\n\n=== INFORME FINAL DE RESULTADOS ===")

# Summarise each model: mean and standard deviation of the F1 score and of
# the inference time across the runs that produced metrics.
for model in ("ASTNN", "CodeBERT"):
    print(f"\n🔹 Modelo: {model}")

    f1_scores = results[model]["f1"]
    if not f1_scores:
        print("No hay datos disponibles.")
        continue

    inference_times = results[model]["time"]
    f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
    time_mean, time_std = np.mean(inference_times), np.std(inference_times)

    print(f"   F1-Score      : {f1_mean:.4f} ± {f1_std:.4f}")
    print(f"   Inferencia (s): {time_mean:.6f}s ± {time_std:.6f}s")
    print(f"   Inferencia (ms): {time_mean * 1000:.2f}ms")

# Dump the raw per-run values (the dict's string form) for later inspection.
with open("resultados_benchmarking.txt", "w") as report_file:
    report_file.write(str(results))