# Source: Versatile-OCR-Program/config.yaml (branch main, raphael-seo/Versatile-OCR-Program on GitHub)

# OCR System Configuration

# Environment: absolute path of the .env file.
env_file_path: '/home/jupyter/Program/.env'

# Directories (both given as absolute paths)
directories:
  credentials: '/home/jupyter/Program/credentials'
  docker_build: '/home/jupyter/Program/OCR/src/ocr'

# Google Cloud Storage
gcs:
  bucket_name: 'eju-ocr-results'
  # Settings below were centralized in v3.0_initial; their defaults match the
  # values that were previously hardcoded.
  stage1_prefix: 'stage_1'
  stage2_prefix: 'stage_2'
  validate_max_results: 5

# API keys (loaded from environment).
# NOTE(review): the values look like environment-variable names rather than
# the secrets themselves -- confirm against the code that reads this section.
api_keys:
  openai: 'OPENAI_API_KEY'
  gemini_api_key: 'GEMINI_API_KEY'
  mathpix_app_id: 'MATHPIX_APP_ID'
  mathpix_app_key: 'MATHPIX_APP_KEY'

# Docker configuration
docker:
  image_name: 'cantaloupe'
  # GPU support is switched off here.
  # NOTE(review): runtime / nvidia_settings presumably only matter when
  # gpu_enabled is true -- confirm in the code that launches the container.
  gpu_enabled: false
  runtime: 'nvidia'
  nvidia_settings:
    visible_devices: 'all'
    driver_capabilities: 'compute,utility'

  # Source files used for the Docker build (relative paths)
  source_files:
    dockerfile: 'src/ocr/Dockerfile'
    advanced_ocr: 'src/ocr/advanced_ocr.py'
    custom_doclayout_yolo: 'src/ocr/custom_doclayout_yolo.py'
    config: 'config.yaml'
    prompts: 'prompts/'

  # Paths inside the container
  container_paths:
    input: '/app/input'
    output: '/app/output'
    credentials: '/app/credentials'
    config: '/app/config'
    python_path: '/app:/app/doclayout-yolo'
    main_script: '/app/advanced_ocr.py'

  # Volume mounts; the entries repeat the input, output, credentials and
  # config values listed under container_paths.
  volumes:
    - '/app/input'
    - '/app/output'
    - '/app/credentials'
    - '/app/config'

# Credentials: file name of the Google Vision service-account JSON
credentials:
  google_vision_account: 'Google_Vision_S.Account.json'

# Logging
logging:
  level: 'INFO'
  # %-style format string with asctime, name, levelname and message fields.
  # It must stay quoted because a plain YAML scalar cannot start with '%'.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  stage2_log_file: 'ocr_stage2.log'

# OCR processing
ocr:
  file_extensions:
    - '.pdf'
  max_display_files: 20
  confidence_threshold: 0.5
  use_cache: true
  cache_dir: 'cache'
  # Settings below were centralized in v3.0_initial; their defaults match the
  # values that were previously hardcoded.
  language_hints:
    - 'ja'
    - 'en'
    - 'ko'
  pdf_dpi: 200
  iou_threshold: 0.5
  image_processing:
    vision_max_dim: 1600
    gemini_max_dim: 1024
    jpeg_quality: 85

# DocLayout-YOLO configuration
doclayout_yolo:
  # Optional path to a local model file; left as null here.
  model_path: null
  # Example values: 'auto', 'cuda:0', 'cpu', etc.
  device: 'auto'
  huggingface_repo_id: 'juliozhao/DocLayout-YOLO-DocStructBench'
  huggingface_filename: 'doclayout_yolo_docstructbench_imgsz1024.pt'
  # Fallback model for ultralytics YOLO.
  fallback_model: 'yolov8n.pt'
  default_imgsz: 1024
  default_conf: 0.25

# Gemini API settings (centralized in v3.0_initial)
gemini:
  model: 'gemini-2.0-flash'
  # Relative paths to the prompt files for figures and tables.
  figure_prompt_path: 'prompts/gemini_figure.txt'
  table_prompt_path: 'prompts/gemini_table.txt'

# Stage 2 processing (ChatGPT correction)
stage2:
  model: 'gpt-5-nano'
  max_tokens: 100000
  # Centralized in v3.0_initial: relative path to the system prompt file.
  system_prompt_path: 'prompts/chatgpt_stage2.md'