Collection-Image-Workflow/extracting_id_from_image.py at main · dsm-museum/Collection-Image-Workflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import base64
from pathlib import Path
from typing import Optional, List
import logging
import argparse

# Load settings from the local "chatai.env" file at import time; the code below
# reads API_KEY, BASE_URL and MODEL via os.getenv().
load_dotenv("chatai.env")


def init_client() -> OpenAI:
  """Build the OpenAI client from the API_KEY and BASE_URL environment variables.

  Returns:
    A client configured with the key and the (possibly unset) base URL.

  Raises:
    EnvironmentError: If API_KEY is missing or empty.
  """
  settings = {
    "api_key": os.getenv("API_KEY"),
    "base_url": os.getenv("BASE_URL"),
  }
  if settings["api_key"]:
    return OpenAI(**settings)
  raise EnvironmentError("API_KEY is not set in environment")

def _encode_image_to_data_url(path: Path, mime: Optional[str] = None) -> str:
  """Return the file at *path* as a base64 ``data:`` URL.

  Args:
    path: Image file to read; the whole file is loaded into memory.
    mime: MIME type to put into the URL. When omitted it is derived from the
      file suffix: ``.png`` files (any letter case) are labelled
      ``image/png``, everything else keeps the historical ``image/JPEG``.

  Returns:
    A string of the form ``data:<mime>;base64,<payload>``.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  if mime is None:
    # PNG files are collected next to JPEGs, so they must not be sent as JPEG.
    mime = "image/png" if path.suffix.lower() == ".png" else "image/JPEG"
  payload = base64.b64encode(path.read_bytes()).decode("ascii")
  return f"data:{mime};base64,{payload}"

def check_kind_of_image(image_path: Path, client: OpenAI, model: str) -> str:
  """Classify an image as 'Dokument', 'Objekt' or 'Unbekannt'.

  The image is sent to the chat-completions endpoint and the model's answer is
  normalised so that callers can rely on exact string comparison.

  Args:
    image_path: Image file to classify.
    client: OpenAI client used for the request.
    model: Name of the model to query.

  Returns:
    Exactly one of 'Dokument', 'Objekt' or 'Unbekannt'. 'Unbekannt' is also
    returned when the request fails or the answer is not one of the labels.
  """
  try:
    data_url = _encode_image_to_data_url(image_path)
    response = client.chat.completions.create(
      model=model,
      messages=[
        {
          "role": "system",
          "content": "Du bist ein Bilderkenner. Entscheide, ob das Bild ein Dokument bzw. ein Foto eines Dokuments/Zettels ist oder ein Foto von einem Objekt ist. Wenn es ein Dokument ist, sage 'Dokument'. Wenn es ein Foto von einem Objekt ist, sage 'Objekt'. Wenn du dir nicht sicher bist, sage 'Unbekannt'."
        },
        {
          "role": "user",
          "content": [
            {"type": "text", "text": "Entscheide, ob das Bild ein Dokument oder ein Foto von einem Objekt ist. Wenn es ein Dokument ist, sage 'Dokument'. Wenn es ein Foto von einem Objekt ist, sage 'Objekt'. Wenn du dir nicht sicher bist, sage 'Unbekannt'."},
            {"type": "image_url", "image_url": {"url": data_url}}
          ]
        },
      ],
      temperature=0
    )
    # The message content may be None (e.g. refusals); treat that as no answer.
    raw_answer = response.choices[0].message.content or ""
  except Exception as exc:  # pragma: no cover - external API
    logging.error("Fehler beim Bestimmen des Bildtyps: %s", exc)
    return "Unbekannt"

  # Models tend to echo the quotes from the prompt or add a full stop; strip
  # such decoration and ignore case before matching against the labels.
  normalized = raw_answer.strip().strip("\"'`.!:").strip().casefold()
  for label in ("Dokument", "Objekt", "Unbekannt"):
    if normalized == label.casefold():
      return label

  logging.warning("Unerwartete Antwort bei der Bildtyp-Bestimmung für %s: %r", image_path, raw_answer)
  return "Unbekannt"

def extract_id_from_image(image_path: Path, client: OpenAI, model: str) -> str:
  """Ask the model for the object ID (and optional comment) shown in an image.

  Args:
    image_path: Image file of the document/label to read.
    client: OpenAI client used for the request.
    model: Name of the model to query.

  Returns:
    The model's answer with surrounding whitespace and wrapping backticks
    removed, or an empty string when the request fails or no answer is given.
    The format of the ID itself is only requested in the prompt; it is not
    validated here.
  """
  try:
    data_url = _encode_image_to_data_url(image_path)

    response = client.chat.completions.create(
      model=model,
      messages=[
        {
          "role": "system",
          "content": "Du bist ein OCR-Helfer. Suche im Bild nach einer numerischen ID und nach einem Kommentar. Der Kommentar steht meistens unter der ID und ist eine Objektbenennung. Bei der ID gibt es zwei Formate: 1) Die ID hat meistens das Format I/12345/67 oder 1/12345/67 oder IV/12345A/67 oder I/12345/67Pos.1 oder I|12345/67Pos.001. Wenn du eine ID findest, verwandle sie in folgendes Format: `I-12345_67 {Kommentar}` oder `I-12345A_67_Pos.1 {Kommentar}` oder `I-12345_12_Pos.001 {Kommentar}`. `Kommentar` (hier ein Platzhalter) ist dabei die kurze Objektbenennung. Wichtig ist, dass die ID immer mit einem I oder IV beginnt, gefolgt von einem Bindestrich, dann 5 Ziffern und optional einem Buchstaben, dann ein Unterstrich, dann 2 Ziffern, dann optional Pos.1 oder Pos.001 oder Pos.0001 oder Pos.1.2 oder so ähnlich und dann optional ein Kommentar getrennt durch ein Leerzeichen. Wenn du keinen Kommentar findest, lasse ihn einfach weg. Wenn du keine ID findest, gib eine leere Zeichenkette zurück. 2) Die ID hat meistens das Format ID 123456. Wenn du eine ID findest, verwandle sie in folgendes Format: `ObjID123456 {Kommentar}`. Wichtig ist, dass die ID immer mit ObjID beginnt, direkt gefolgt von sechs Ziffern, optional Unterstrich + Kommentar. Auch hier ist mit Kommentar die kurze Objektbenennung gemeint. Wenn du keine ID findest, gib eine leere Zeichenkette zurück."
        },
        {
          "role": "user",
          "content": [
            {"type": "text", "text": "Suche im Bild nach einer numerischen ID."},
            {"type": "image_url", "image_url": {"url": data_url}},
          ]
        },
      ],
      temperature=0
    )
    # The message content may be None; that simply means "no ID found".
    answer = response.choices[0].message.content or ""
  except Exception as exc:  # pragma: no cover - external API
    logging.error("Fehler beim Extrahieren der ID: %s", exc)
    return ""

  # The prompt shows the target format in backticks, so the model may echo
  # them; they must not end up in directory or file names.
  return answer.strip().strip("`").strip()

def _collect_image_files(directory: Path, patterns: Optional[List[str]] = None) -> List[Path]:
  """Return the sorted, de-duplicated files in *directory* matching *patterns*.

  When *patterns* is None, JPG/JPEG/PNG globs in upper and lower case are used.
  Only *directory* itself is searched, not its sub-directories.
  """
  globs = ["*.JPG", "*.jpg", "*.JPEG", "*.jpeg", "*.PNG", "*.png"] if patterns is None else patterns
  # A set removes paths matched by more than one pattern before sorting.
  matches = {hit for glob_pattern in globs for hit in directory.glob(glob_pattern)}
  return sorted(matches)

def _move_group(images: List[Path], dest_dir: Path, prefix: str) -> None:
  """Move *images* into *dest_dir* as ``<prefix>_<index><suffix>`` without overwriting."""
  dest_dir.mkdir(parents=True, exist_ok=True)
  # Names already present in the target (e.g. from an earlier group that got
  # the same ID) must not be reused, because rename() may replace them silently.
  taken_stems = {entry.stem for entry in dest_dir.iterdir()}
  index_by_basename = {}
  next_index = 0
  for img in images:
    # NEF and JPG of the same shot share a basename and therefore an index.
    base_name = img.with_suffix("").name
    if base_name not in index_by_basename:
      while f"{prefix}_{next_index}" in taken_stems:
        next_index += 1
      index_by_basename[base_name] = next_index
      next_index += 1
    idx = index_by_basename[base_name]
    img.rename(dest_dir / f"{prefix}_{idx}{img.suffix}")


def process_images(image_dir: Path, client: OpenAI, model: str, output_dir: Path) -> None:
  """Sort the images in *image_dir* into per-object folders below *output_dir*.

  Images are visited in sorted order. Every image classified as 'Objekt' is
  queued (together with a same-named ``.NEF`` file if one exists). When an
  image classified as 'Dokument' is reached, its ID is extracted and the
  queued images plus the document image are moved to
  ``output_dir/<ID with comment>/`` and renamed to ``<ID>_<index><suffix>``.
  Images with any other classification stay where they are.

  Args:
    image_dir: Directory containing the input images.
    client: OpenAI client used for classification and ID extraction.
    model: Name of the model to query.
    output_dir: Root directory for the sorted output; created on demand.
  """
  pending: List[Path] = []

  for image_path in _collect_image_files(image_dir):
    kind_of_image = check_kind_of_image(image_path, client, model)
    logging.info("%s: %s", image_path.name, kind_of_image)

    if kind_of_image == "Objekt":
      pending.append(image_path)
      nef_image_path = image_path.with_suffix(".NEF")
      if nef_image_path.exists():
        pending.append(nef_image_path)
      continue

    if kind_of_image != "Dokument":
      continue

    object_id = extract_id_from_image(image_path, client, model)
    # The ID comes from a model and becomes a path component: neutralise path
    # separators and collapse line breaks / repeated whitespace.
    object_id = " ".join(object_id.replace("/", "-").replace("\\", "-").split())
    if not object_id or object_id in {".", ".."}:
      logging.warning("Keine ID gefunden in %s", image_path)
      object_id = "unknown"
    logging.info("Extracted ID: %s", object_id)

    pending.append(image_path)
    # The folder keeps the comment, the file names use the bare ID only.
    _move_group(pending, output_dir / object_id, object_id.split()[0])
    pending = []

  if pending:
    # Object images are only moved once a document image follows them.
    logging.warning("%d Dateien ohne nachfolgendes Dokument wurden nicht verschoben", len(pending))

def main(argv: Optional[List[str]] = None) -> int:
  """Command-line entry point: parse arguments and sort the input images.

  Args:
    argv: Argument list to parse; None lets argparse read ``sys.argv``.

  Returns:
    0 on success, 2 when the API key, the model name or the input directory
    is missing.
  """
  parser = argparse.ArgumentParser(description="Extract IDs from images and sort them")
  parser.add_argument("--input", "-i", default="input_images", help="Input images directory")
  parser.add_argument("--output", "-o", default="output_images", help="Output directory")
  parser.add_argument("--model", "-m", default=os.getenv("MODEL"), help="Model name to use")
  args = parser.parse_args(argv)

  logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

  try:
    client = init_client()
  except EnvironmentError as exc:
    logging.error(str(exc))
    return 2

  if not args.model:
    logging.error("MODEL is not set. Provide it via --model or environment variable MODEL.")
    return 2

  input_dir = Path(args.input)
  if not input_dir.is_dir():
    # Globbing a missing directory yields nothing, so a mistyped path would
    # otherwise look like a successful run.
    logging.error("Input directory does not exist: %s", input_dir)
    return 2

  process_images(input_dir, client, args.model, Path(args.output))
  return 0


if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())