Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aiutil/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""A utils Python package for data scientists."""

__version__ = "0.86.1"
__version__ = "0.87.0"
48 changes: 48 additions & 0 deletions aiutil/pdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Manipulating PDFs."""

import datetime
from pathlib import Path
import re
from typing import Iterable
from pypdf import PdfWriter, PdfReader
import pdfplumber

# Date format (YYYYMMDD) passed to ``strftime`` to build the date part of renamed file names.
FMT = "%Y%m%d"


def extract_pages(file: str, subfiles: dict[str, int | Iterable[int]]) -> None:
Expand Down Expand Up @@ -42,3 +48,45 @@ def _extract_pages(
writer.add_page(reader.pages[index])
with open(output, "wb") as fout:
writer.write(fout)


def extract_text_first_page(path: str | Path) -> str:
    """Return the text found on the first page of a PDF file.

    :param path: Location of the PDF file to read.
    :return: The text extracted from the first page.
    """
    with pdfplumber.open(path) as document:
        # Extract while the document is still open.
        return document.pages[0].extract_text()


def _rename_puget_sound_energy(path: Path, text_first_page: str) -> Path:
    """Rename a Puget Sound Energy bill to ``pse_<date>.pdf``.

    The date is read from the ``Issued: <Month> <day>, <year>`` text on the
    first page and formatted with the module-level ``FMT``.

    :param path: The path of the PDF file.
    :param text_first_page: The text extracted from the first page of the PDF file.
    :return: The path of the renamed PDF file (same directory as ``path``).
    :raises ValueError: If no issue date is found in the text
        or the matched date cannot be parsed.
    """
    m = re.search(r"Issued: (\w+ \d{1,2}, \d{4})", text_first_page)
    if m is None:
        # Fail with a clear message instead of an AttributeError on ``None.group``.
        raise ValueError(
            f"No issue date ('Issued: <Month> <day>, <year>') found in {path}."
        )
    date = datetime.datetime.strptime(m.group(1), "%B %d, %Y").strftime(FMT)
    path_new = path.with_name(f"pse_{date}.pdf")
    path.rename(path_new)
    return path_new


def _rename_bellevue_water(path: Path, text_first_page: str) -> Path:
    """Rename a City of Bellevue water bill to ``bellevue_water_<date>.pdf``.

    The date is read from the ``Bill Date: <month>/<day>/<year>`` text on the
    first page and formatted with the module-level ``FMT``.

    :param path: The path of the PDF file.
    :param text_first_page: The text extracted from the first page of the PDF file.
    :return: The path of the renamed PDF file (same directory as ``path``).
    :raises ValueError: If no bill date is found in the text
        or the matched date cannot be parsed.
    """
    m = re.search(r"Bill Date: (\d{1,2}/\d{1,2}/\d{4})", text_first_page)
    if m is None:
        # Fail with a clear message instead of an AttributeError on ``None.group``.
        raise ValueError(f"No bill date ('Bill Date: MM/DD/YYYY') found in {path}.")
    date = datetime.datetime.strptime(m.group(1), "%m/%d/%Y").strftime(FMT)
    path_new = path.with_name(f"bellevue_water_{date}.pdf")
    path.rename(path_new)
    return path_new


def rename_auto(path: str | Path) -> Path:
    """Rename a PDF file automatically based on its content.

    The text of the first page is searched for a known issuer marker
    ("Puget Sound Energy" or "MyUtilityBill.bellevuewa.gov") and the file is
    renamed by the matching issuer-specific helper.

    :param path: The path of the PDF file.
    :return: The path of the renamed PDF file, or the original path (file left
        untouched) if no known issuer marker is found.
    """
    path = Path(path)
    text = extract_text_first_page(path)
    if "Puget Sound Energy" in text:
        return _rename_puget_sound_energy(path, text)
    if "MyUtilityBill.bellevuewa.gov" in text:
        return _rename_bellevue_water(path, text)
    # Unrecognized document: leave the file alone and return its path so the
    # declared ``Path`` return type holds instead of an implicit ``None``.
    return path
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "aiutil"
version = "0.86.1"
version = "0.87.0"
description = "A utils Python package for data scientists."
authors = [{ name = "Benjamin Du", email = "[email protected]" }]
requires-python = ">=3.10,<3.14"
Expand Down Expand Up @@ -30,6 +30,7 @@ dependencies = [
"paramiko>=3.2.0",
"nbformat>=5.10.4",
"nbconvert>=7.16.6",
"pdfplumber>=0.11.7",
]

[project.optional-dependencies]
Expand Down
51 changes: 50 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading