-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_jplag.py
More file actions
executable file
·98 lines (77 loc) · 3.42 KB
/
analyze_jplag.py
File metadata and controls
executable file
·98 lines (77 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
import os
import json
import time
import sys
import zipfile
from pathlib import Path
# Minimum similarity, expressed as a percentage, for a comparison to be
# reported. analyze_jplag_results() compares it against
# similarities['MAX'] * 100 for every entry in a report's top_comparisons.
# Set the similarity threshold here
SIMILARITY_THRESHOLD = 70
def die(msg):
    """Write *msg* to stderr and terminate the process with exit status 1."""
    print(msg, file=sys.stderr)
    # sys.exit(1) is exactly this exception; raising it directly is equivalent.
    raise SystemExit(1)
def get_jplag_jar():
    """Locate third_party/jplag.jar in the directory containing this script.

    Returns the path to the JAR as a ``pathlib.Path``. When the file does not
    exist, the error is reported through ``die()``, which ends the process.
    """
    candidate = Path(__file__).parent.joinpath("third_party", "jplag.jar")
    if not candidate.exists():
        die(f"Error: JPlag JAR not found at {candidate}")
    return candidate
def get_latest_results_dir():
    """Return the most recently modified subdirectory of ``jplag_results``.

    The ``jplag_results`` directory is resolved relative to the current
    working directory. If it is missing, or contains no subdirectories, the
    problem is reported through ``die()``, which ends the process.
    """
    root = Path("jplag_results")
    if not root.exists():
        die("Error: jplag_results directory does not exist")

    candidates = [entry for entry in root.iterdir() if entry.is_dir()]
    if not candidates:
        die("Error: no JPlag results found in jplag_results")

    # Newest modification time wins. max() keeps the first of several equally
    # new entries, which is the same element a stable newest-first sort would
    # place at index 0.
    return max(candidates, key=lambda entry: entry.stat().st_mtime)
def analyze_jplag_results(results_dir, threshold=None):
    """Scan JPlag report archives and print the comparisons worth a look.

    Every ``*.zip`` directly inside *results_dir* is opened, its
    ``overview.json`` member is parsed, and each entry of ``top_comparisons``
    whose ``similarities['MAX']`` (scaled to a percentage) reaches the
    threshold is printed, followed by a command line for opening the report.

    Args:
        results_dir: ``pathlib.Path`` of the directory holding the archives.
        threshold: Minimum similarity percentage to report. ``None`` (the
            default) uses the module-level ``SIMILARITY_THRESHOLD``.

    A failure while processing one archive (unreadable ZIP, missing
    ``overview.json``, unexpected JSON layout) is printed and the remaining
    archives are still analyzed. When *results_dir* holds no ZIP file at all,
    the error is reported through ``die()``, which ends the process.
    """
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD

    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments made while the analysis is running.
    started = time.perf_counter()

    # Sorted so the report order does not depend on directory listing order.
    zip_files = sorted(results_dir.glob("*.zip"))
    if not zip_files:
        die(f"Error: no JPlag results found in {results_dir}")

    for zip_file in zip_files:
        # Archive names presumably follow "<lang>-<prob>.zip". A stem without
        # a dash used to raise ValueError here and abort the whole run; it is
        # now used verbatim as the label so the archive still gets analyzed.
        lang, dash, prob = zip_file.stem.partition('-')
        label = f"{lang}/{prob}" if dash else zip_file.stem

        try:
            with zipfile.ZipFile(zip_file) as zf:
                with zf.open("overview.json") as f:
                    data = json.load(f)

            # Collect (first, second, percentage) for every comparison at or
            # above the threshold; 'MAX' is stored as a fraction.
            high_similarity = []
            for comparison in data['top_comparisons']:
                similarity = comparison['similarities']['MAX'] * 100
                if similarity >= threshold:
                    high_similarity.append((
                        comparison['first_submission'],
                        comparison['second_submission'],
                        similarity,
                    ))

            if high_similarity:
                cases_string = 'case' if len(high_similarity) == 1 else 'cases'
                print(f"\n{label} ({len(high_similarity)} {cases_string}):")
                for sub1, sub2, sim in high_similarity:
                    print(f" • {sub1} vs {sub2}: {sim:.2f}%")
                # Suggest command to view report. get_jplag_jar() may end the
                # process via die(); SystemExit is not an Exception subclass,
                # so the handler below does not intercept it.
                print(f"\n View report:\n java -jar {get_jplag_jar()} {zip_file}")
        except Exception as e:
            # Best effort: report the broken archive and continue with the rest.
            print(f"Error processing {zip_file}: {e}")

    elapsed = time.perf_counter() - started
    print(f"\nAnalysis completed in {elapsed:.1f}s")
def main():
    """Command-line entry point: analyze a results directory.

    Accepts at most one positional argument, the results directory. Without
    it, the newest subdirectory of ``jplag_results`` is analyzed. More than
    one argument prints the usage text and exits with status 1.
    """
    extra_args = sys.argv[1:]
    if len(extra_args) > 1:
        print(f"Usage: {sys.argv[0]} [results_dir]")
        print("Example: python analyze_jplag.py jplag_results/b46e698a7")
        raise SystemExit(1)

    # An explicit directory takes precedence over the most recent run.
    if extra_args:
        results_dir = Path(extra_args[0])
    else:
        results_dir = get_latest_results_dir()

    if not results_dir.exists():
        die(f"Error: results directory '{results_dir}' does not exist")

    analyze_jplag_results(results_dir)
# Run the analysis only when this file is executed as a script; importing it
# as a module has no side effects.
if __name__ == "__main__":
    main()