ER_OCR/split_pdfs.py at master · bala-actuary/ER_OCR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python3
"""
Split Electoral Roll PDFs into individual page files.

Reads original multi-page PDFs from Input/ER_Downloads/AC-xxx/{english,tamil}/
and splits them into single-page PDFs in Input/split_files/AC-xxx/{english,tamil}/.

All pages are split. Non-data pages (metadata, summary, map, legend) are
auto-detected and skipped during extraction by extract_ocr.py.

Usage:
    python split_pdfs.py                    # Interactive prompt for AC number
    python split_pdfs.py --ac AC-188        # Direct AC specification
    python split_pdfs.py --ac AC-188 --force  # Overwrite existing split files
"""

import argparse
import glob
import os
import sys
from pathlib import Path

from pypdf import PdfReader, PdfWriter

# Filesystem layout, anchored at the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent
DOWNLOADS_DIR = SCRIPT_DIR.joinpath("Input", "ER_Downloads")  # multi-page source PDFs
SPLIT_DIR = SCRIPT_DIR.joinpath("Input", "split_files")  # single-page output PDFs

# Leading pages to leave out of each PDF, per language. A value of 0 splits
# every page; non-data pages (metadata, summary, map, legend) are instead
# auto-detected later, during extraction by extract_ocr.py.
SKIP_PAGES = dict(english=0, tamil=0)


def list_available_acs() -> list[str]:
    """Return the sorted names of the ``AC-*`` folders inside DOWNLOADS_DIR.

    An empty list is returned when DOWNLOADS_DIR itself does not exist.
    """
    if not DOWNLOADS_DIR.exists():
        return []

    ac_names = []
    for entry in DOWNLOADS_DIR.iterdir():
        # Plain files and folders not named AC-... are ignored.
        if entry.is_dir() and entry.name.startswith("AC-"):
            ac_names.append(entry.name)
    ac_names.sort()
    return ac_names


def split_pdfs_for_language(input_folder: Path, output_folder: Path, skip_pages: int,
                            language: str, force: bool = False) -> dict:
    """Split every PDF under input_folder into single-page PDF files.

    PDFs are discovered recursively. Page N (1-based) of ``name.pdf`` is
    written to ``output_folder / "name_page_N.pdf"``; the sub-folder layout of
    input_folder is not reproduced in output_folder.

    Each page is first written to a temporary ``*.part`` file and only then
    moved to its final name. A write that fails or is interrupted therefore
    never leaves a truncated ``*_page_N.pdf`` behind, which a later run
    without ``force`` would otherwise treat as already done and skip.

    Args:
        input_folder: Folder searched recursively for ``*.pdf`` files.
        output_folder: Folder that receives the page files; created on
            demand (only when at least one PDF was found).
        skip_pages: Number of leading pages of each PDF to leave out.
        language: Language label of the folder being processed. It does not
            influence the splitting; it is kept for interface compatibility.
        force: When true, existing page files are overwritten instead of
            being skipped.

    Returns:
        Dict of counters: ``total_files`` (PDFs found), ``total_pages`` (page
        files written), ``skipped_existing`` (page files left untouched) and
        ``errors`` (PDFs whose processing raised an exception).
    """
    stats = {"total_files": 0, "total_pages": 0, "skipped_existing": 0, "errors": 0}

    pdf_files = sorted(glob.glob(str(input_folder / "**" / "*.pdf"), recursive=True))
    if not pdf_files:
        print(f"  No PDF files found in {input_folder}")
        return stats

    output_folder.mkdir(parents=True, exist_ok=True)

    for pdf_path in pdf_files:
        stats["total_files"] += 1
        pdf_name = os.path.basename(pdf_path)
        base_name = os.path.splitext(pdf_name)[0]

        try:
            reader = PdfReader(pdf_path)
            total_pages = len(reader.pages)

            if total_pages <= skip_pages:
                print(f"  WARNING: {pdf_name} has only {total_pages} page(s), "
                      f"need >{skip_pages} — skipping")
                continue

            for page_idx in range(skip_pages, total_pages):
                # File names carry the 1-based page number of the source PDF.
                original_page_num = page_idx + 1
                out_path = output_folder / f"{base_name}_page_{original_page_num}.pdf"

                if out_path.exists() and not force:
                    stats["skipped_existing"] += 1
                    continue

                writer = PdfWriter()
                writer.add_page(reader.pages[page_idx])

                # Write next to the target and rename afterwards, so the final
                # name only ever refers to a completely written file.
                part_path = out_path.with_name(out_path.name + ".part")
                try:
                    with open(part_path, "wb") as f:
                        writer.write(f)
                    os.replace(part_path, out_path)
                except BaseException:
                    # Also covers Ctrl+C: drop the partial file, then let the
                    # original exception continue to propagate.
                    part_path.unlink(missing_ok=True)
                    raise

                stats["total_pages"] += 1

        except Exception as e:
            # One bad PDF must not abort the whole batch; record and move on.
            print(f"  ERROR processing {pdf_name}: {e}")
            stats["errors"] += 1

    return stats


def prompt_ac_number() -> str:
    """Ask the user for an AC number on stdin and return the stripped reply.

    When AC folders already exist under DOWNLOADS_DIR they are listed before
    the prompt. The reply is returned as typed (minus surrounding
    whitespace); it is not validated here.
    """
    known_acs = list_available_acs()
    if known_acs:
        print(f"\nAvailable ACs in {DOWNLOADS_DIR}:")
        print("\n".join(f"  {name}" for name in known_acs))
        print()

    reply = input("Please enter the Assembly Constituency No in AC-xxx format, example AC-188: ")
    return reply.strip()


def validate_ac(ac_name: str) -> Path:
    """Check that ac_name refers to a usable AC input folder and return its path.

    The name must start with ``AC-``, the folder ``DOWNLOADS_DIR / ac_name``
    must exist, and it must contain both an ``english`` and a ``tamil``
    entry. On the first failed check an error is printed and the process
    exits with status 1 (via ``sys.exit``).
    """
    if not ac_name.startswith("AC-"):
        print(f"ERROR: '{ac_name}' is not in AC-xxx format (e.g., AC-188)")
        sys.exit(1)

    ac_path = DOWNLOADS_DIR / ac_name
    if not ac_path.exists():
        existing_acs = list_available_acs()
        print(f"ERROR: Directory not found: {ac_path}")
        print(f"\nPlease create your input folder at: {DOWNLOADS_DIR / 'AC-xxx'}")
        print("Place English PDFs in the 'english' subfolder and Tamil PDFs in the 'tamil' subfolder.")
        if existing_acs:
            print(f"\nExisting AC directories: {', '.join(existing_acs)}")
        sys.exit(1)

    # Report only the first missing language folder, checking english first.
    missing_lang = next(
        (lang for lang in ("english", "tamil") if not (ac_path / lang).exists()),
        None,
    )
    if missing_lang is not None:
        print(f"ERROR: Missing '{missing_lang}' subfolder in {ac_path}")
        print(f"Expected: {ac_path / missing_lang}")
        sys.exit(1)

    return ac_path


def main():
    """Command-line entry point: split the PDFs of one AC for every language.

    Reads ``--ac`` and ``--force`` from the command line, prompts for the AC
    when ``--ac`` is not given, validates the input folder, then splits the
    PDFs of each language listed in SKIP_PAGES and prints per-language and
    overall totals.
    """
    parser = argparse.ArgumentParser(
        description="Split Electoral Roll PDFs into individual page files."
    )
    parser.add_argument(
        "--ac",
        type=str,
        default=None,
        help="Assembly Constituency in AC-xxx format (e.g., AC-188)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing split files",
    )
    args = parser.parse_args()

    # Fall back to the interactive prompt when --ac is absent or empty.
    ac_name = args.ac or prompt_ac_number()
    ac_path = validate_ac(ac_name)
    output_base = SPLIT_DIR / ac_name

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Splitting PDFs for {ac_name}")
    print(f"  Source:  {ac_path}")
    print(f"  Output:  {output_base}")
    if args.force:
        print("  Mode:    FORCE (overwriting existing files)")
    print(f"{rule}\n")

    # Running totals across all languages, keyed like the per-language stats.
    totals = {"total_files": 0, "total_pages": 0}

    for language, skip_count in SKIP_PAGES.items():
        input_folder = ac_path / language
        output_folder = output_base / language

        pattern = str(input_folder / "**" / "*.pdf")
        pdf_count = len(glob.glob(pattern, recursive=True))
        skip_msg = ""
        if skip_count > 0:
            skip_msg = f", skipping first {skip_count} page(s) each"
        print(f"[{language.upper()}] Found {pdf_count} PDF(s){skip_msg}, splitting all pages...")

        stats = split_pdfs_for_language(
            input_folder, output_folder, skip_count, language, force=args.force
        )
        for key in totals:
            totals[key] += stats[key]

        print(f"  Split {stats['total_files']} file(s) -> {stats['total_pages']} page(s)")
        if stats["skipped_existing"] > 0:
            print(f"  Skipped {stats['skipped_existing']} existing file(s) (use --force to overwrite)")
        if stats["errors"] > 0:
            print(f"  {stats['errors']} error(s) encountered")
        print()

    print(rule)
    print(f"Done! Total: {totals['total_files']} PDF(s) -> {totals['total_pages']} page(s)")
    print(f"Split files saved to: {output_base}")
    print(f"\nNext step: python extract_ocr.py {ac_name} --workers 4")
    print(rule)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()