Skip to content

Commit ac92090

Browse files
author
unknown
committed
Refactor scope list to match ml_config.json and remove unused /usr/local/lib/python
1 parent baa19bc commit ac92090

File tree

3 files changed

+56
-97
lines changed

3 files changed

+56
-97
lines changed

download_data.py

Lines changed: 56 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -23,19 +23,66 @@
2323
logger = logging.getLogger(__file__)
2424

2525

26-
def get_file_type(file_path: str, file_extension: str):
27-
file_path = file_path.lower()
28-
29-
example_indicators = ["test", "examp"]
30-
other_indicators = ["doc/", "documen", ".md", "readme"]
31-
32-
if any(ind in file_path for ind in example_indicators):
26+
from pathlib import Path
27+
28+
# Path fragments checked, in this order, against the normalized path.
# The first fragment found in the path decides the scope, so order matters.
SCOPES = [
    "/conf",
    "/dist-packages/",
    "/example",
    "/record",
    "/script",
    "/site-packages/",
    "/src/",
    "/test",
    "/tool",

    "/assets/",
]

# Fallback substrings used only when no SCOPES entry matched.
example_indicators = ["test", "examp"]
other_indicators = ["doc/", "documen", ".md", "readme"]


def get_file_type(file_path: str, file_extension: str) -> str:
    """Determine the file type (scope) based on path and known indicators.

    The path is normalized to lower-case with forward slashes, then checked
    in three stages:

    1. The first entry of ``SCOPES`` that occurs as a substring wins; the
       entry is returned with its surrounding slashes stripped
       (e.g. ``"/site-packages/"`` -> ``"site-packages"``).
    2. Otherwise, any ``example_indicators`` substring yields ``"test"``.
    3. Otherwise, any ``other_indicators`` substring, or an empty
       ``file_extension``, yields ``"other"``.

    Anything left over is classified as ``"src"``.

    Args:
        file_path: Path of the file to classify (either separator style).
        file_extension: The file's extension; only compared against ``""``.

    Returns:
        The scope name as a string.
    """
    # as_posix() only converts backslashes on Windows hosts; replace them
    # explicitly so Windows-style paths classify the same way everywhere.
    posix_path = Path(file_path).as_posix().replace("\\", "/").lower()

    for scope in SCOPES:
        if scope in posix_path:
            return scope.strip("/")

    if any(ind in posix_path for ind in example_indicators):
        return "test"
    if any(ind in posix_path for ind in other_indicators) or file_extension == "":
        return "other"

    return "src"
61+
62+
def restructure_dataset(input_dir: Path, output_dir: Path):
    """Reorganize files into scope-based folders.

    Every regular file found recursively under ``input_dir`` is copied to
    ``output_dir / <project> / <scope> / <file name>``, where ``<project>``
    is the first path component below ``input_dir`` and ``<scope>`` is the
    value returned by ``get_file_type``.

    Args:
        input_dir: Root directory to scan.
        output_dir: Root directory that receives the copies; missing parent
            directories are created.

    Note:
        Only the file name is kept below the scope folder, so two files with
        the same name in the same project and scope end up at the same
        target path and the later copy overwrites the earlier one.
    """
    for file_path in input_dir.glob("**/*"):
        if not file_path.is_file():
            continue

        relative_path = file_path.relative_to(input_dir)
        project_name = relative_path.parts[0]

        # Classify on the path relative to input_dir so directories above the
        # dataset root (e.g. ".../src/raw_data") cannot decide the scope, and
        # pass the suffix that get_file_type requires as its second argument.
        scope = get_file_type(relative_path.as_posix(), file_path.suffix)
        target_path = output_dir / project_name / scope / file_path.name

        target_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, target_path)
78+
79+
if __name__ == "__main__":
    # Command-line entry point: copy --input into --output, grouped by scope.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=Path, required=True, help="Raw dataset directory (e.g., ./raw_data)")
    parser.add_argument("--output", type=Path, required=True, help="Processed output directory (e.g., ./scoped_data)")
    args = parser.parse_args()

    # Globbing a missing directory yields nothing, so without this check a
    # mistyped --input would exit successfully having copied no files.
    if not args.input.is_dir():
        parser.error(f"--input is not a directory: {args.input}")

    restructure_dataset(args.input, args.output)
3986

4087
def collect_licenses(temp_dir, ownername, reponame):
4188
license_files = list(pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("*LICEN*"))

migrate_cleanup.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

restructure_by_scope.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

0 commit comments

Comments
 (0)