|
1 | 1 | import re |
2 | 2 | from google.cloud import vision |
3 | 3 |
|
# Shared Cloud Vision client: created once when this module is imported and
# used by parse_receipt for every request instead of building a new client
# per call.
_client = vision.ImageAnnotatorClient()
| 5 | + |
4 | 6 |
|
def parse_receipt(image_bytes: bytes) -> dict:
    """Run document text detection on a receipt image and pull out key fields.

    Args:
        image_bytes: Raw image content passed to Cloud Vision.

    Returns:
        A dict with the keys "merchant", "total", "date" and "currency".
        "currency" is always "USD". When the response carries no full-text
        annotation, the other fields are "", 0.0 and "" respectively.

    Raises:
        RuntimeError: If the Cloud Vision response reports an error message.
    """
    response = _client.document_text_detection(
        image=vision.Image(content=image_bytes)
    )

    error_message = response.error.message
    if error_message:
        raise RuntimeError(f"Cloud Vision error: {error_message}")

    annotation = response.full_text_annotation
    if not annotation:
        return {"merchant": "", "total": 0.0, "date": "", "currency": "USD"}

    text = annotation.text
    # Keep only non-blank lines, with surrounding whitespace removed.
    stripped = (raw.strip() for raw in text.splitlines())
    lines = [entry for entry in stripped if entry]

    return {
        "merchant": _extract_merchant(lines),
        "total": _extract_total(text),
        "date": _extract_date(text),
        "currency": "USD",
    }
17 | 25 |
|
18 | | - # Last dollar amount on the receipt is typically the total |
19 | | - amounts = re.findall(r'\$?\s*(\d+\.\d{2})', text) |
20 | | - total = float(amounts[-1]) if amounts else 0.0 |
21 | 26 |
|
22 | | - date_match = re.search( |
23 | | - r'(\d{4}-\d{2}-\d{2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', text |
| 27 | +def _extract_merchant(lines: list) -> str: |
| 28 | + skip = re.compile( |
| 29 | + r'\d{1,5}\s+\w+\s+(st|ave|blvd|rd|dr|ln|way)\b' |
| 30 | + r'|\d{2}[:/]\d{2}' |
| 31 | + r'|^\d+$', |
| 32 | + re.IGNORECASE, |
24 | 33 | ) |
25 | | - date = date_match.group(1) if date_match else "" |
| 34 | + for line in lines[:5]: |
| 35 | + if not skip.search(line) and len(line) > 2: |
| 36 | + return line |
| 37 | + return lines[0] if lines else "" |
| 38 | + |
| 39 | + |
| 40 | +def _extract_total(text: str) -> float: |
| 41 | + for line in text.splitlines(): |
| 42 | + if re.search(r'\btotal\b', line, re.IGNORECASE): |
| 43 | + match = re.search(r'\$?\s*(\d+\.\d{2})', line) |
| 44 | + if match: |
| 45 | + return float(match.group(1)) |
| 46 | + amounts = re.findall(r'\$?\s*(\d+\.\d{2})', text) |
| 47 | + return max((float(a) for a in amounts), default=0.0) |
26 | 48 |
|
27 | | - return {"merchant": merchant, "total": total, "date": date, "currency": "USD"} |
| 49 | + |
| 50 | +def _extract_date(text: str) -> str: |
| 51 | + match = re.search( |
| 52 | + r'\b(\d{4}-\d{2}-\d{2}|(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-]\d{2,4})\b', |
| 53 | + text, |
| 54 | + ) |
| 55 | + return match.group(1) if match else "" |
0 commit comments