-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_paths.py
More file actions
108 lines (96 loc) · 3.97 KB
/
check_paths.py
File metadata and controls
108 lines (96 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Quick script to verify all data paths are correctly configured.
Run this before starting the pipeline to ensure data files are accessible.
"""
from pathlib import Path
from config.config import get_config
def check_paths(config=None):
    """Report which configured data paths exist and whether the pipeline can start.

    Prints four sections to stdout: directory status, required data files,
    cached-artifact status, and a summary with either next steps or
    instructions for obtaining the missing data.

    Args:
        config: Optional configuration object exposing a ``paths`` attribute.
            When omitted, the object returned by ``get_config()`` is used.

    Returns:
        bool: ``True`` when every required data file exists, ``False``
        otherwise. Missing directories and uncached artifacts are reported
        but never affect the result.
    """
    if config is None:
        config = get_config()
    paths = config.paths

    print("=" * 80)
    print("CAFA 6 - Data Path Verification")
    print("=" * 80)

    # Check data directories
    print("\n📁 DIRECTORY STATUS:")
    print("-" * 80)
    dirs_to_check = {
        "CAFA Data (base)": paths.DATA_DIR,
        "Train Directory": paths.TRAIN_DIR,
        "Test Directory": paths.TEST_DIR,
        "Output Directory": paths.OUTPUT_DIR,
        "Models Directory": paths.MODEL_DIR,
        "Embeddings Directory": paths.EMBEDDINGS_DIR,
        "Artifacts Directory": paths.ARTIFACTS_DIR,
    }
    for name, path in dirs_to_check.items():
        # Path(...) keeps this working if a config entry is a plain string.
        exists = Path(path).exists()
        status = "✅" if exists else "❌"
        print(f"{status} {name:25s} → {path}")
        if not exists:
            # A missing directory is only a warning; it does not fail the check.
            print(" ⚠️ Will be created automatically")

    # Check required data files
    print("\n📄 REQUIRED DATA FILES:")
    print("-" * 80)
    files_to_check = {
        "Train Sequences": paths.TRAIN_SEQUENCES_FILE,
        "Train Labels": paths.TRAIN_LABELS_FILE,
        "Train Taxonomy": paths.TRAIN_TAXONOMY_FILE,
        "Test Sequences": paths.TEST_SEQUENCES_FILE,
        "Test Taxonomy": paths.TEST_TAXONOMY_FILE,
        "GO Ontology": paths.GO_OBO_PATH,
    }
    all_files_exist = True
    for name, path in files_to_check.items():
        exists = Path(path).exists()
        status = "✅" if exists else "❌ MISSING"
        print(f"{status} {name:25s} → {path}")
        if not exists:
            # Keep iterating so every missing file is listed, not just the first.
            all_files_exist = False
            print(" ❌ ERROR: This file is REQUIRED and must exist!")

    # Check artifact cache status
    print("\n💾 CACHED ARTIFACTS STATUS:")
    print("-" * 80)
    artifact_status = paths.get_all_artifacts_status()
    artifact_names = {
        'go_processor': 'GO Processor',
        'labels_matrix': 'Labels Matrix',
        'taxonomy_encoder': 'Taxonomy Encoder',
        'train_taxon': 'Train Taxon Encodings',
        'diamond_db': 'DIAMOND Database',
        'blast_results': 'BLAST Results',
    }
    cached_count = 0
    for key, name in artifact_names.items():
        # Artifacts absent from the status mapping are skipped silently.
        if key in artifact_status:
            exists = artifact_status[key]
            status = "✅ CACHED" if exists else "⚪ Will be generated"
            if exists:
                cached_count += 1
            print(f"{status} {name:25s}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY:")
    print("=" * 80)
    if all_files_exist:
        print("✅ All required data files found!")
        print(f"💾 {cached_count}/{len(artifact_names)} artifacts cached")
        if cached_count > 0:
            print(f" ⚡ This will save ~{cached_count * 5}-{cached_count * 15} minutes!")
        print("\n✨ You're ready to start the pipeline!")
        print("\nNext steps:")
        print(" 1. python main.py check_status # Verify data integrity")
        print(" 2. python main.py embeddings # Generate embeddings (4-6 hours)")
        print(" 3. python main.py prepare_data # Prepare training data (5-10 mins)")
        print(" 4. python main.py train # Train models (2-4 hours)")
        return True
    else:
        print("❌ MISSING REQUIRED DATA FILES!")
        print("\n⚠️ Please ensure you have downloaded all CAFA 6 data files")
        print(" and placed them in the 'CAFA Data' folder:")
        print(f"\n 📂 {paths.DATA_DIR}")
        print("\n📥 Download from: https://www.kaggle.com/competitions/cafa-6-protein-function-prediction/data")
        return False
# Script entry point: run the verification and expose the outcome as the
# process exit status (0 when all required files exist, 1 otherwise).
if __name__ == "__main__":
    import sys

    success = check_paths()
    sys.exit(0 if success else 1)