Versatile-OCR-Program/auto_run_stage1.py at main · raphael-seo/Versatile-OCR-Program · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Auto-Run Stage 1: Recursive OCR Processing Wrapper
Wrapper script that provides recursive directory scanning for Stage 1 OCR processing.
Results are stored directly in GCS, no local output directories.
"""

import os
import yaml
import logging
import subprocess
import argparse
from pathlib import Path

# ----------------------------
# [1] Configuration Loader
# ----------------------------
def load_auto_run_config(config_path="auto_run.yaml"):
    """
    Read the auto-run settings from a YAML file.

    Args:
        config_path (str): Path of the YAML file to parse
            (default: "auto_run.yaml").

    Returns:
        The object produced by yaml.safe_load for the file contents, or
        None when opening or parsing the file raises (the error is printed
        to stdout instead of being propagated).
    """
    try:
        with open(config_path, mode='r', encoding='utf-8') as stream:
            return yaml.safe_load(stream)
    except Exception as err:
        # Every failure is reported on stdout and signalled to the caller as None.
        print(f"Error loading auto-run config file: {err}")
        return None

# ----------------------------
# [2] Recursive Directory Scanner
# ----------------------------
def find_pdf_directories(root_dir, extensions=('.pdf',)):
    """
    Recursively find all directories containing PDF files.

    Hidden entries (names starting with '.') are ignored: hidden
    sub-directories are not descended into and hidden files never count as
    a match. The special names '.' and '..' are NOT treated as hidden, so a
    root such as "." or "/data/." is scanned like any other directory.

    Args:
        root_dir (str): Root directory to scan
        extensions (str | list | tuple): File extension(s) to look for,
            compared case-insensitively against the end of each file name.
            A single string is treated as one extension.

    Returns:
        list: Directory paths (as produced by os.walk starting at root_dir)
        that directly contain at least one matching, non-hidden file
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Loop-invariant: build the lowercase suffix tuple once, not per file.
    suffixes = tuple(ext.lower() for ext in extensions)

    pdf_directories = []

    logging.info(f"Scanning root directory: {root_dir}")

    for root, dirs, files in os.walk(root_dir):
        # Prune hidden directories in place so os.walk never descends into
        # them; sorting makes the traversal order reproducible across runs.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

        # Sub-directories are already pruned above, so this only concerns the
        # root itself. '.' and '..' are path references, not hidden names.
        current_name = os.path.basename(root)
        if current_name.startswith('.') and current_name not in ('.', '..'):
            logging.info(f"Skipping hidden directory: {root}")
            continue

        logging.info(f"Checking directory: {root}")
        logging.info(f"  Files found: {files}")

        # Matching files in this directory (excluding hidden files)
        pdf_files = [f for f in files
                     if not f.startswith('.') and f.lower().endswith(suffixes)]

        if pdf_files:
            logging.info(f"  PDF files in {root}: {pdf_files}")
            pdf_directories.append(root)
        else:
            logging.info(f"  No PDF files in {root}")

    logging.info(f"Total directories with PDFs: {pdf_directories}")
    return pdf_directories

# ----------------------------
# [3] Stage 1 OCR Processor
# ----------------------------
def run_stage1_ocr(input_dir):
    """
    Run Stage 1 OCR processing using the existing ocr_stage_1.py script.

    The script is expected at <this file's directory>/src/stages/ocr_stage_1.py
    and is launched as a child process with ``--input <input_dir>``. This
    function only launches the script and reports its exit status; per the
    log message emitted below, result upload to GCS is handled downstream
    (by the Docker container), not here.

    Args:
        input_dir (str | os.PathLike): Input directory containing PDFs

    Returns:
        bool: True if the child process exited with status 0, False if it
        exited non-zero or could not be started (the error is logged)
    """
    # Local import keeps this block self-contained; the module does not
    # import sys at file level.
    import sys

    try:
        # Directory where this script is located
        script_dir = os.path.dirname(os.path.abspath(__file__))

        # ocr_stage_1.py lives under src/stages/ next to this script
        ocr_stage1_path = os.path.join(script_dir, "src", "stages", "ocr_stage_1.py")

        # Use the interpreter running this wrapper so the child sees the same
        # environment/virtualenv; fall back to "python3" if it is unknown.
        python_executable = sys.executable or "python3"

        # Build command to run ocr_stage_1.py (os.fspath accepts str or Path)
        cmd = [
            python_executable, ocr_stage1_path,
            "--input", os.fspath(input_dir)
        ]

        logging.info(f"Running Stage 1 OCR for directory: {input_dir}")
        logging.info(f"Command: {' '.join(cmd)}")

        # check=True turns a non-zero exit status into CalledProcessError
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)

        # The child's output is captured; keep it available at DEBUG level
        # instead of discarding it on success.
        if result.stdout:
            logging.debug(f"STDOUT: {result.stdout}")
        if result.stderr:
            logging.debug(f"STDERR: {result.stderr}")

        logging.info(f"Stage 1 OCR completed successfully for: {input_dir}")
        logging.info("Results uploaded to GCS automatically by Docker container")

        return True

    except subprocess.CalledProcessError as e:
        logging.error(f"Stage 1 OCR failed for {input_dir}: {e}")
        logging.error(f"STDOUT: {e.stdout}")
        logging.error(f"STDERR: {e.stderr}")
        return False
    except Exception as e:
        # e.g. interpreter not found, or an input_dir that is not path-like
        logging.error(f"Unexpected error in Stage 1 OCR for {input_dir}: {e}")
        return False

# ----------------------------
# [4] Main Auto-Run Function
# ----------------------------
def main():
    """
    Entry point: load the auto-run config and run Stage 1 OCR.

    Reads the ``stage1`` section of the YAML file given by ``--config``:
    ``input_directory`` (required) and ``mode`` (default 'recursive').
    A relative input_directory is resolved against the directory containing
    this script. In 'recursive' mode every directory returned by
    find_pdf_directories is passed to run_stage1_ocr; any other mode value
    passes input_directory itself to run_stage1_ocr once.

    Returns:
        bool: True when every run succeeded (or recursive mode found nothing
        to process); False on configuration errors or when any run failed
    """
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description="Auto-Run Stage 1: Recursive OCR Processing")
    parser.add_argument("--config", default="auto_run.yaml",
                        help="Auto-run configuration file path (default: auto_run.yaml)")

    args = parser.parse_args()

    # Load auto-run configuration; it must be a non-empty mapping
    config = load_auto_run_config(args.config)
    if not config or not isinstance(config, dict):
        logger.error("Failed to load auto-run configuration")
        return False

    # Get Stage 1 configuration. A YAML "stage1:" key with no value loads as
    # None, so fall back to an empty mapping instead of crashing on .get().
    stage1_config = config.get('stage1') or {}
    if not isinstance(stage1_config, dict):
        logger.error("Stage 1 configuration must be a mapping")
        return False

    input_directory = stage1_config.get('input_directory')
    mode = stage1_config.get('mode', 'recursive')

    if not input_directory:
        logger.error("Stage 1 input_directory must be specified in config")
        return False

    # Convert to absolute path if relative (relative to this script's directory)
    if not os.path.isabs(input_directory):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        input_directory = os.path.join(script_dir, input_directory)

    if not os.path.exists(input_directory):
        logger.error(f"Input directory does not exist: {input_directory}")
        return False

    # os.walk yields nothing for a non-directory, which would otherwise be
    # reported as a successful run that processed nothing.
    if mode == 'recursive' and not os.path.isdir(input_directory):
        logger.error(f"Input path is not a directory: {input_directory}")
        return False

    logger.info("=== Auto-Run Stage 1: Recursive OCR Processing Starting ===")
    logger.info(f"Input directory: {input_directory}")
    logger.info(f"Mode: {mode}")
    logger.info("Results will be uploaded directly to GCS by Docker container")

    if mode != 'recursive':
        # Direct processing mode (non-recursive)
        logger.info("Running in direct processing mode")
        success = run_stage1_ocr(input_directory)

        if success:
            logger.info("=== Auto-Run Stage 1 Complete: Successfully processed ===")
        else:
            logger.error("=== Auto-Run Stage 1 Failed ===")

        return success

    # Find all directories containing PDFs
    pdf_directories = find_pdf_directories(input_directory)
    logger.info(f"Found {len(pdf_directories)} directories containing PDF files")

    if not pdf_directories:
        logger.warning("No directories with PDF files found")
        return True

    # Process each directory; a failure does not stop the remaining ones
    success_count = 0
    total_count = len(pdf_directories)

    for i, pdf_dir in enumerate(pdf_directories, 1):
        # Show which PDF files are in this directory (excluding hidden files)
        pdf_files = [f for f in os.listdir(pdf_dir)
                     if f.lower().endswith('.pdf') and not f.startswith('.')]
        logger.info(f"Processing directory {i}/{total_count}: {pdf_dir}")
        logger.info(f"  PDF files to process: {pdf_files}")

        # Run Stage 1 OCR for this directory
        if run_stage1_ocr(pdf_dir):
            success_count += 1
            logger.info(f"Successfully processed {i}/{total_count}: {pdf_dir}")
        else:
            logger.error(f"Failed to process {i}/{total_count}: {pdf_dir}")

    logger.info(f"=== Auto-Run Stage 1 Complete: {success_count}/{total_count} directories processed successfully ===")
    return success_count == total_count

if __name__ == "__main__":
    # Map main()'s boolean result to a process exit status (0 = success).
    # SystemExit is raised directly because the bare exit() helper is injected
    # by the site module and is not guaranteed to exist in every interpreter.
    success = main()
    raise SystemExit(0 if success else 1)