-
Notifications
You must be signed in to change notification settings - Fork 49
Expand file tree
/
Copy pathauto_run_stage1.py
More file actions
214 lines (171 loc) · 7.48 KB
/
auto_run_stage1.py
File metadata and controls
214 lines (171 loc) · 7.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Auto-Run Stage 1: Recursive OCR Processing Wrapper
Wrapper script that provides recursive directory scanning for Stage 1 OCR processing.
Results are stored directly in GCS, no local output directories.
"""
import argparse
import logging
import os
import subprocess
import sys
from pathlib import Path

import yaml
# ----------------------------
# [1] Configuration Loader
# ----------------------------
def load_auto_run_config(config_path="auto_run.yaml"):
    """Load the auto-run configuration from a YAML file.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        The document parsed by ``yaml.safe_load``, or ``None`` when the file
        cannot be opened or parsed. Callers treat a falsy result as
        "configuration unavailable".
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: this is a best-effort loader and any failure
        # (missing file, bad encoding, malformed YAML) maps to ``None``.
        # Report through logging, like the rest of this module, so the
        # message carries the configured timestamp/level format.
        logging.error("Error loading auto-run config file: %s", e)
        return None
# ----------------------------
# [2] Recursive Directory Scanner
# ----------------------------
def find_pdf_directories(root_dir, extensions=('.pdf',)):
    """Recursively find all directories that directly contain matching files.

    Hidden directories (name starting with ``.``) are not descended into, and
    hidden files never count as a match. Extension matching is
    case-insensitive.

    Args:
        root_dir (str): Root directory to scan.
        extensions (iterable of str | str): File extensions to look for.
            A single string is treated as one extension.

    Returns:
        list: Directory paths, as produced by ``os.walk(root_dir)``, that
        contain at least one matching, non-hidden file.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Loop-invariant: build the lowered suffix tuple once, not once per file.
    suffixes = tuple(ext.lower() for ext in extensions)

    pdf_directories = []
    logging.info(f"Scanning root directory: {root_dir}")
    for root, dirs, files in os.walk(root_dir):
        # Prune in place so os.walk never descends into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        # Because of the pruning above, only the scan root itself can reach
        # this check. "." and ".." are path references, not hidden
        # directories, so they must not be skipped.
        current_name = os.path.basename(root)
        if current_name.startswith('.') and current_name not in ('.', '..'):
            logging.info(f"Skipping hidden directory: {root}")
            continue

        logging.info(f"Checking directory: {root}")
        logging.info(f" Files found: {files}")

        pdf_files = [
            f for f in files
            if not f.startswith('.') and f.lower().endswith(suffixes)
        ]
        if pdf_files:
            logging.info(f" PDF files in {root}: {pdf_files}")
            pdf_directories.append(root)
        else:
            logging.info(f" No PDF files in {root}")

    logging.info(f"Total directories with PDFs: {pdf_directories}")
    return pdf_directories
# ----------------------------
# [3] Stage 1 OCR Processor
# ----------------------------
def run_stage1_ocr(input_dir):
    """Run Stage 1 OCR on one directory via the ``ocr_stage_1.py`` script.

    The script is expected at ``src/stages/ocr_stage_1.py`` relative to the
    directory containing this file and is invoked as
    ``<python> ocr_stage_1.py --input <input_dir>``. Per the log messages
    below, result upload to GCS is handled by the Docker container rather
    than by this function.

    Args:
        input_dir (str): Input directory containing PDFs.

    Returns:
        bool: ``True`` if the child process exited with status 0, ``False``
        if it exited non-zero or could not be started.
    """
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        ocr_stage1_path = os.path.join(script_dir, "src", "stages", "ocr_stage_1.py")

        # Use the interpreter running this wrapper so the child sees the same
        # environment and installed packages; a bare "python3" may be missing
        # from PATH (e.g. on Windows) or resolve to a different installation.
        interpreter = sys.executable or "python3"
        cmd = [
            interpreter, ocr_stage1_path,
            "--input", input_dir
        ]
        logging.info(f"Running Stage 1 OCR for directory: {input_dir}")
        logging.info(f"Command: {' '.join(cmd)}")

        # check=True turns a non-zero exit status into CalledProcessError.
        completed = subprocess.run(cmd, check=True, capture_output=True, text=True)

        # Output is captured, so surface it at debug level instead of
        # silently discarding it on success.
        logging.debug("STDOUT: %s", completed.stdout)
        logging.debug("STDERR: %s", completed.stderr)
        logging.info(f"Stage 1 OCR completed successfully for: {input_dir}")
        logging.info("Results uploaded to GCS automatically by Docker container")
        return True
    except subprocess.CalledProcessError as e:
        logging.error(f"Stage 1 OCR failed for {input_dir}: {e}")
        logging.error(f"STDOUT: {e.stdout}")
        logging.error(f"STDERR: {e.stderr}")
        return False
    except Exception as e:
        # Covers failures to launch the process at all (e.g. the interpreter
        # is not found); reported as a failed directory, not a crash.
        logging.error(f"Unexpected error in Stage 1 OCR for {input_dir}: {e}")
        return False
# ----------------------------
# [4] Main Auto-Run Function
# ----------------------------
def main():
    """Entry point: load the auto-run config and run Stage 1 OCR.

    Reads the ``stage1`` section of the YAML file given by ``--config``
    (default ``auto_run.yaml``). A relative ``input_directory`` is resolved
    against the directory containing this script. With ``mode: recursive``
    (the default) every directory under the input directory that contains
    PDFs is processed in turn; any other mode processes the input directory
    directly.

    Returns:
        bool: ``True`` when every processed directory succeeded (or none
        contained PDFs), ``False`` on a configuration error or any failed
        directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description="Auto-Run Stage 1: Recursive OCR Processing")
    parser.add_argument("--config", default="auto_run.yaml",
                        help="Auto-run configuration file path (default: auto_run.yaml)")
    args = parser.parse_args()

    # A usable configuration must be a non-empty mapping; a YAML list or
    # scalar would otherwise crash on the .get() calls below.
    config = load_auto_run_config(args.config)
    if not config or not isinstance(config, dict):
        logger.error("Failed to load auto-run configuration")
        return False

    # An empty "stage1:" key parses as None, so fall back to an empty mapping
    # and let the input_directory check report the problem.
    stage1_config = config.get('stage1') or {}
    if not isinstance(stage1_config, dict):
        stage1_config = {}
    input_directory = stage1_config.get('input_directory')
    mode = stage1_config.get('mode', 'recursive')
    if not input_directory:
        logger.error("Stage 1 input_directory must be specified in config")
        return False

    # Relative paths are anchored at the script's directory, not the CWD.
    if not os.path.isabs(input_directory):
        script_dir = os.path.dirname(os.path.abspath(__file__))
        input_directory = os.path.join(script_dir, input_directory)

    # isdir rather than exists: a regular file here would scan to nothing and
    # be reported as success.
    if not os.path.isdir(input_directory):
        logger.error(f"Input directory does not exist: {input_directory}")
        return False

    logger.info("=== Auto-Run Stage 1: Recursive OCR Processing Starting ===")
    logger.info(f"Input directory: {input_directory}")
    logger.info(f"Mode: {mode}")
    logger.info("Results will be uploaded directly to GCS by Docker container")

    if mode != 'recursive':
        # Direct processing mode (non-recursive).
        logger.info("Running in direct processing mode")
        success = run_stage1_ocr(input_directory)
        if success:
            logger.info("=== Auto-Run Stage 1 Complete: Successfully processed ===")
        else:
            logger.error("=== Auto-Run Stage 1 Failed ===")
        return success

    pdf_directories = find_pdf_directories(input_directory)
    logger.info(f"Found {len(pdf_directories)} directories containing PDF files")
    if not pdf_directories:
        # Nothing to do is not treated as a failure.
        logger.warning("No directories with PDF files found")
        return True

    success_count = 0
    total_count = len(pdf_directories)
    for i, pdf_dir in enumerate(pdf_directories, 1):
        # Listed for the log only; the OCR script does its own discovery.
        pdf_files = [f for f in os.listdir(pdf_dir)
                     if f.lower().endswith('.pdf') and not f.startswith('.')]
        logger.info(f"Processing directory {i}/{total_count}: {pdf_dir}")
        logger.info(f" PDF files to process: {pdf_files}")

        # A failed directory is recorded but does not stop the remaining ones.
        if run_stage1_ocr(pdf_dir):
            success_count += 1
            logger.info(f"Successfully processed {i}/{total_count}: {pdf_dir}")
        else:
            logger.error(f"Failed to process {i}/{total_count}: {pdf_dir}")

    logger.info(f"=== Auto-Run Stage 1 Complete: {success_count}/{total_count} directories processed successfully ===")
    return success_count == total_count
if __name__ == "__main__":
    # Map main()'s boolean result to a process exit status (0 = success).
    # SystemExit is raised directly because the ``exit`` helper is installed
    # by the ``site`` module and is not guaranteed to exist (e.g. under
    # ``python -S`` or in frozen builds).
    raise SystemExit(0 if main() else 1)