|
# Module-level logger. It is registered under this file's path (__file__),
# not under the dotted module name.
logger = logging.getLogger(__file__)
24 | 24 |
|
25 | 25 |
|
26 | | -def get_file_type(file_path: str, file_extension: str): |
27 | | - file_path = file_path.lower() |
28 | | - |
29 | | - example_indicators = ["test", "examp"] |
30 | | - other_indicators = ["doc/", "documen", ".md", "readme"] |
31 | | - |
32 | | - if any(ind in file_path for ind in example_indicators): |
| 26 | +from pathlib import Path |
| 27 | + |
# Path fragments that map a file to a scope. get_file_type() tests them in
# list order with a plain substring check, so an entry without a trailing
# slash (e.g. "/test") also matches longer directory or file names that
# start with it (e.g. "/tests/", "/test_utils.py").
SCOPES = [
    "/conf",
    "/dist-packages/",
    "/example",
    "/record",
    "/script",
    "/site-packages/",
    "/src/",
    "/test",
    "/tool",
    "/assets/",
]

# Fallback substrings used when no SCOPES entry matches:
# any hit in example_indicators classifies the file as "test" ...
example_indicators = ["test", "examp"]
# ... and any hit in other_indicators classifies it as "other".
other_indicators = ["doc/", "documen", ".md", "readme"]
| 44 | + |
def get_file_type(file_path: str, file_extension: "str | None" = None) -> str:
    """Classify a file into a scope based on its path.

    The path is converted to forward slashes and lower-cased, then checked
    in this order:

    1. The first entry of ``SCOPES`` (in list order) that occurs as a
       substring of the path wins; it is returned with its surrounding
       slashes stripped (e.g. ``"/site-packages/"`` -> ``"site-packages"``).
    2. If any of ``example_indicators`` occurs in the path -> ``"test"``.
    3. If any of ``other_indicators`` occurs in the path, or the file has no
       extension -> ``"other"``.
    4. Otherwise -> ``"src"``.

    Args:
        file_path: Path of the file to classify.
        file_extension: Extension of the file; only compared against the
            empty string. When omitted (``None``), it is derived from
            ``file_path`` via ``Path.suffix`` so callers that only know the
            path can still use this function.

    Returns:
        The scope name as described above.
    """
    posix_path = Path(file_path).as_posix().lower()

    if file_extension is None:
        # Callers that do not track the extension separately get it from the
        # path itself; "" means "no extension".
        file_extension = Path(file_path).suffix

    # Explicit scope fragments take precedence over the generic indicators.
    for scope in SCOPES:
        if scope in posix_path:
            return scope.strip("/")

    if any(ind in posix_path for ind in example_indicators):
        return "test"
    if file_extension == "" or any(ind in posix_path for ind in other_indicators):
        return "other"

    return "src"
| 61 | + |
def restructure_dataset(input_dir: Path, output_dir: Path) -> None:
    """Copy every file under *input_dir* into scope-based folders.

    The first path component below *input_dir* is taken as the project name.
    Each regular file is copied with ``shutil.copy2`` to
    ``output_dir / <project> / <scope> / <file name>``, where the scope is
    the value returned by ``get_file_type``. Target directories are created
    as needed.

    Files located directly in *input_dir* (no project directory above them)
    are skipped.

    NOTE: the layout is flat per scope, so two files of the same project that
    share both a name and a scope map to the same target path; whichever is
    copied last overwrites the other.

    Args:
        input_dir: Root directory containing one sub-directory per project.
        output_dir: Root directory that receives the reorganized copies.
    """
    # Local import so the function does not depend on the module header
    # providing shutil.
    import shutil

    for file_path in input_dir.glob("**/*"):
        if not file_path.is_file():
            continue

        relative = file_path.relative_to(input_dir)
        if len(relative.parts) < 2:
            # The file sits directly in input_dir; parts[0] would be the file
            # name itself, which must not be used as a project folder.
            continue
        project_name = relative.parts[0]

        # Classify on the path below input_dir (with a leading slash so the
        # "/..." scope fragments can match the first component). This keeps
        # the location of the dataset itself, e.g. ".../src/raw_data", from
        # influencing the result. get_file_type needs the extension as well.
        scope = get_file_type(f"/{relative.as_posix()}", file_path.suffix)
        target_path = output_dir / project_name / scope / file_path.name

        target_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, target_path)
| 78 | + |
if __name__ == "__main__":
    import argparse

    # Command-line entry point: both directories are mandatory and are
    # parsed straight into Path objects.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--input", "Raw dataset directory (e.g., ./raw_data)"),
        ("--output", "Processed output directory (e.g., ./scoped_data)"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=description)

    options = cli.parse_args()
    restructure_dataset(options.input, options.output)
39 | 86 |
|
40 | 87 | def collect_licenses(temp_dir, ownername, reponame): |
41 | 88 | license_files = list(pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("*LICEN*")) |
|
0 commit comments