CAFA-6/check_paths.py at main · manan-monani/CAFA-6 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Quick script to verify all data paths are correctly configured.
Run this before starting the pipeline to ensure data files are accessible.
"""

from pathlib import Path
from config.config import get_config

def _path_present(path, *, expect_dir):
    """Return True when *path* exists and is of the expected kind.

    Args:
        path: A ``str`` or ``pathlib.Path``; coerced with ``Path(...)`` so the
            configuration may supply either.
        expect_dir: When true the path must be a directory, otherwise it must
            be a regular file.
    """
    candidate = Path(path)
    # A bare ``exists()`` would accept a directory where a file is required
    # (and vice versa), so check the kind explicitly.
    return candidate.is_dir() if expect_dir else candidate.is_file()


def _report_directories(paths):
    """Print one status line per configured directory.

    Missing directories are only reported; they do not affect the overall
    result of ``check_paths``.
    """
    print("\n📁 DIRECTORY STATUS:")
    print("-" * 80)
    dirs_to_check = {
        "CAFA Data (base)": paths.DATA_DIR,
        "Train Directory": paths.TRAIN_DIR,
        "Test Directory": paths.TEST_DIR,
        "Output Directory": paths.OUTPUT_DIR,
        "Models Directory": paths.MODEL_DIR,
        "Embeddings Directory": paths.EMBEDDINGS_DIR,
        "Artifacts Directory": paths.ARTIFACTS_DIR,
    }

    for name, path in dirs_to_check.items():
        exists = _path_present(path, expect_dir=True)
        status = "✅" if exists else "❌"
        print(f"{status} {name:25s} → {path}")
        if not exists:
            print("   ⚠️  Will be created automatically")


def _report_required_files(paths):
    """Print one status line per required data file.

    Returns:
        bool: True when every required file is present as a regular file.
    """
    print("\n📄 REQUIRED DATA FILES:")
    print("-" * 80)
    files_to_check = {
        "Train Sequences": paths.TRAIN_SEQUENCES_FILE,
        "Train Labels": paths.TRAIN_LABELS_FILE,
        "Train Taxonomy": paths.TRAIN_TAXONOMY_FILE,
        "Test Sequences": paths.TEST_SEQUENCES_FILE,
        "Test Taxonomy": paths.TEST_TAXONOMY_FILE,
        "GO Ontology": paths.GO_OBO_PATH,
    }

    all_files_exist = True
    for name, path in files_to_check.items():
        exists = _path_present(path, expect_dir=False)
        status = "✅" if exists else "❌ MISSING"
        print(f"{status} {name:25s} → {path}")
        if not exists:
            all_files_exist = False
            print("   ❌ ERROR: This file is REQUIRED and must exist!")
    return all_files_exist


def _report_artifacts(paths):
    """Print the cache status of each known artifact.

    Artifact keys absent from ``paths.get_all_artifacts_status()`` are not
    printed, but they still count towards the returned total.

    Returns:
        tuple[int, int]: ``(cached_count, total_known_artifacts)``.
    """
    print("\n💾 CACHED ARTIFACTS STATUS:")
    print("-" * 80)
    artifact_status = paths.get_all_artifacts_status()
    artifact_names = {
        'go_processor': 'GO Processor',
        'labels_matrix': 'Labels Matrix',
        'taxonomy_encoder': 'Taxonomy Encoder',
        'train_taxon': 'Train Taxon Encodings',
        'diamond_db': 'DIAMOND Database',
        'blast_results': 'BLAST Results',
    }

    cached_count = 0
    for key, name in artifact_names.items():
        if key not in artifact_status:
            continue
        exists = artifact_status[key]
        status = "✅ CACHED" if exists else "⚪ Will be generated"
        if exists:
            cached_count += 1
        print(f"{status} {name:25s}")
    return cached_count, len(artifact_names)


def check_paths() -> bool:
    """Check that all required data files and directories exist.

    Loads the configuration via ``get_config()`` and prints a four-part
    report: directory status, required data files, cached artifacts, and a
    summary with next steps or download instructions.

    Returns:
        bool: True when every required data file is present, False otherwise.
        Missing directories and uncached artifacts are reported but do not
        change the result.
    """
    config = get_config()
    paths = config.paths

    print("=" * 80)
    print("CAFA 6 - Data Path Verification")
    print("=" * 80)

    _report_directories(paths)
    all_files_exist = _report_required_files(paths)
    cached_count, artifact_total = _report_artifacts(paths)

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY:")
    print("=" * 80)

    if not all_files_exist:
        print("❌ MISSING REQUIRED DATA FILES!")
        print("\n⚠️  Please ensure you have downloaded all CAFA 6 data files")
        print("    and placed them in the 'CAFA Data' folder:")
        print(f"\n    📂 {paths.DATA_DIR}")
        print("\n📥 Download from: https://www.kaggle.com/competitions/cafa-6-protein-function-prediction/data")
        return False

    print("✅ All required data files found!")
    print(f"💾 {cached_count}/{artifact_total} artifacts cached")
    if cached_count > 0:
        # Printed estimate: 5-15 minutes saved per cached artifact.
        print(f"   ⚡ This will save ~{cached_count * 5}-{cached_count * 15} minutes!")
    print("\n✨ You're ready to start the pipeline!")
    print("\nNext steps:")
    print("  1. python main.py check_status    # Verify data integrity")
    print("  2. python main.py embeddings      # Generate embeddings (4-6 hours)")
    print("  3. python main.py prepare_data    # Prepare training data (5-10 mins)")
    print("  4. python main.py train           # Train models (2-4 hours)")
    return True

if __name__ == "__main__":
    # Script entry point: the process exit status mirrors the check result
    # (0 when check_paths() returns a truthy value, 1 otherwise).
    import sys

    exit_code = 0 if check_paths() else 1
    sys.exit(exit_code)