-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
560 lines (466 loc) · 19.3 KB
/
Copy pathapp.py
File metadata and controls
560 lines (466 loc) · 19.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
"""
On-Prem Document Thumbnail Service
----------------------------------
Flow:
1. Client uploads a file (pdf/doc/docx/ppt/pptx).
2. Service:
- Generates a doc_id.
- Saves the original file locally (for RAG / retention).
- Generates a PNG screenshot of the first page/slide.
- Saves that PNG locally.
3. Responds with:
- doc_id
- original_file_url
- thumbnail_url (frontend can <img src=...>)
Extras:
- File size limit check
- Logging
- /files endpoint to list uploaded docs
- /delete/{doc_id} to clean up
- metadata.json to remember uploads
This file is meant to be portable.
Change only the CONFIG section for different machines/environments.
"""
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from datetime import datetime
from typing import Literal

import fitz # PyMuPDF
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
# ─────────────────────────────────────────
# CONFIG (edit this part per environment)
# ─────────────────────────────────────────
# Where to store stuff on disk (documents, thumbnails, metadata.json)
BASE_STORAGE_DIR = os.getenv("BASE_STORAGE_DIR", "./storage")
DOCUMENTS_DIR = os.path.join(BASE_STORAGE_DIR, "documents")
THUMBNAILS_DIR = os.path.join(BASE_STORAGE_DIR, "thumbnails")
METADATA_FILE = os.path.join(BASE_STORAGE_DIR, "metadata.json")

# Max allowed upload size in bytes (50 MB default)
MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", str(50 * 1024 * 1024)))

# Path to LibreOffice CLI. The SOFFICE_CMD environment variable always wins;
# otherwise a per-platform default is used:
# macOS: /Applications/LibreOffice.app/Contents/MacOS/soffice
# Linux: /usr/bin/libreoffice
# Windows: C:\\Program Files\\LibreOffice\\program\\soffice.exe
if os.name == "nt":
    _DEFAULT_SOFFICE_CMD = "C:\\Program Files\\LibreOffice\\program\\soffice.exe"
elif sys.platform == "darwin":
    _DEFAULT_SOFFICE_CMD = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
else:
    # Previously the macOS path was used here too, which never exists on Linux.
    _DEFAULT_SOFFICE_CMD = "/usr/bin/libreoffice"
SOFFICE_CMD = os.getenv("SOFFICE_CMD", _DEFAULT_SOFFICE_CMD)

# CORS allowed origins (comma-separated). In prod you can set a specific
# frontend URL. Entries are stripped so "https://a, https://b" works, and
# empty entries are dropped.
CORS_ALLOW_ORIGINS = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
]
# ─────────────────────────────────────────
# Setup filesystem + logging
# ─────────────────────────────────────────
# Create the storage directories before anything else: the StaticFiles mounts
# further down are pointed at these same two paths.
os.makedirs(DOCUMENTS_DIR, exist_ok=True)
os.makedirs(THUMBNAILS_DIR, exist_ok=True)
# Configure root logging at INFO; this module logs under "thumbnail-service".
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("thumbnail-service")
# ─────────────────────────────────────────
# App + static mounts
# ─────────────────────────────────────────
app = FastAPI(
title="On-Prem Document Thumbnail API",
version="1.0.0",
description="Upload a doc/pdf/pptx/etc and get a first-page thumbnail"
)
# CORS: origins come from CORS_ALLOW_ORIGINS; all methods and headers allowed.
# NOTE(review): allow_credentials=True combined with a wildcard "*" origin is
# very permissive — confirm this is intended before exposing beyond a trusted
# network, or set CORS_ALLOW_ORIGINS to explicit frontend URLs.
app.add_middleware(
CORSMiddleware,
allow_origins=CORS_ALLOW_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Static file serving for browser access:
# /documents/<doc_id>.<ext> (original file)
# /thumbnails/<doc_id>.png (preview image)
# These URL shapes match the original_file_url / thumbnail_url values built by
# the upload handler.
app.mount("/documents", StaticFiles(directory=DOCUMENTS_DIR), name="documents")
app.mount("/thumbnails", StaticFiles(directory=THUMBNAILS_DIR), name="thumbnails")
# ─────────────────────────────────────────
# Metadata helpers
# ─────────────────────────────────────────
def load_metadata():
    """Read the doc_id -> info mapping from METADATA_FILE.

    Returns:
        The parsed JSON content of the metadata file, or an empty dict when
        the file does not exist yet.
    """
    if not os.path.exists(METADATA_FILE):
        return {}
    with open(METADATA_FILE, "r") as handle:
        stored = json.load(handle)
    return stored
def save_metadata(metadata):
    """Persist the metadata mapping to METADATA_FILE atomically.

    The JSON is first written to a uniquely named sibling temp file and then
    moved over METADATA_FILE with os.replace, so a crash or serialization
    error part-way through never leaves a truncated metadata.json behind.

    Args:
        metadata: JSON-serializable mapping of doc_id -> info dict.

    Raises:
        OSError: if the temp file cannot be written or moved into place.
        TypeError: if ``metadata`` is not JSON-serializable.
    """
    # Same directory as the target so os.replace stays on one filesystem.
    tmp_path = f"{METADATA_FILE}.{uuid.uuid4().hex}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(metadata, f, indent=2)
        os.replace(tmp_path, METADATA_FILE)
    except BaseException:
        # Best-effort removal of the partial temp file; the original error is
        # what the caller needs to see.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
def add_file_metadata(doc_id, filename, extension, uploaded_at):
    """Store (or overwrite) the metadata entry for ``doc_id``.

    Loads the current metadata, sets the entry with the given filename,
    extension and upload timestamp, and saves the result back to disk.
    """
    entry = dict(filename=filename, extension=extension, uploaded_at=uploaded_at)
    store = load_metadata()
    store[doc_id] = entry
    save_metadata(store)
def check_duplicate_filename(filename):
    """Look up the first metadata entry whose stored filename equals ``filename``.

    Returns:
        A ``(doc_id, info)`` tuple for the first match in metadata order, or
        ``(None, None)`` when no entry has that filename.
    """
    matches = (
        (known_id, details)
        for known_id, details in load_metadata().items()
        if details["filename"] == filename
    )
    return next(matches, (None, None))
def remove_file_metadata(doc_id):
    """Drop the metadata entry for ``doc_id`` and persist the change.

    Does nothing (and does not rewrite the file) when ``doc_id`` is unknown.
    """
    store = load_metadata()
    if doc_id not in store:
        return
    store.pop(doc_id)
    save_metadata(store)
# ─────────────────────────────────────────
# PDF -> PNG (first page only)
# ─────────────────────────────────────────
def pdf_first_page_to_png(pdf_bytes: bytes) -> bytes:
    """Render the first page of an in-memory PDF to PNG bytes.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        PNG-encoded image of page 0, rendered at 2x zoom.

    Raises:
        HTTPException: 400 if the bytes cannot be opened as a PDF, or if the
            PDF has no pages.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        logger.error(f"Failed to open PDF: {e}")
        raise HTTPException(
            status_code=400,
            detail="File is not a valid PDF or cannot be opened as PDF."
        ) from e

    # try/finally so the document is closed on every path, including when
    # page loading or rendering raises.
    try:
        if doc.page_count == 0:
            raise HTTPException(status_code=400, detail="PDF has 0 pages.")
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for quality
        return pix.tobytes("png")
    finally:
        doc.close()
# ─────────────────────────────────────────
# Office -> PDF via LibreOffice/soffice -> PNG
# ─────────────────────────────────────────
def office_first_page_to_png(
    file_bytes: bytes,
    original_extension: Literal[".doc", ".docx", ".ppt", ".pptx"]
) -> bytes:
    """Render the first page/slide of an Office document to PNG bytes.

    Steps:
    1. Write the upload into a private temporary work directory, keeping the
       original extension.
    2. Run soffice (LibreOffice CLI) headless to convert it to PDF.
    3. Render the first page of that PDF via ``pdf_first_page_to_png``.
    4. Remove the whole work directory, whatever happened.

    Args:
        file_bytes: Raw content of the Office file.
        original_extension: Extension (with leading dot) used for the
            temporary input file name.

    Returns:
        PNG-encoded image of the first page of the converted PDF.

    Raises:
        HTTPException: 500 if soffice cannot be started, times out (30 s),
            exits with an error, or produces no PDF. Errors raised by
            ``pdf_first_page_to_png`` propagate unchanged.
    """
    # One directory holds both the input file and the conversion output, so a
    # single rmtree in ``finally`` cleans up everything (including any
    # subdirectories soffice might create).
    work_dir = tempfile.mkdtemp(prefix="thumbnail-")
    try:
        tmp_in_path = os.path.join(work_dir, "input" + original_extension)
        tmp_out_dir = os.path.join(work_dir, "out")
        os.mkdir(tmp_out_dir)
        with open(tmp_in_path, "wb") as tmp_in:
            tmp_in.write(file_bytes)

        logger.info(f"[{tmp_in_path}] Converting {original_extension} -> PDF with {SOFFICE_CMD}")
        try:
            result = subprocess.run(
                [
                    SOFFICE_CMD,
                    "--headless",
                    "--convert-to", "pdf",
                    "--outdir", tmp_out_dir,
                    tmp_in_path,
                ],
                capture_output=True,
                check=True,
                timeout=30,
            )
        except subprocess.TimeoutExpired as e:
            logger.error("LibreOffice/soffice timed out")
            raise HTTPException(
                status_code=500,
                detail="LibreOffice/soffice conversion timed out."
            ) from e
        except subprocess.CalledProcessError as e:
            stderr_text = (e.stderr or b"").decode("utf-8", "ignore")
            logger.error(f"LibreOffice/soffice failed: {stderr_text}")
            raise HTTPException(
                status_code=500,
                detail=(
                    "LibreOffice/soffice failed to convert file to PDF. "
                    "stderr=" + stderr_text
                ),
            ) from e
        except OSError as e:
            # Binary missing or not executable (e.g. wrong SOFFICE_CMD).
            logger.error(f"Could not start LibreOffice/soffice ({SOFFICE_CMD}): {e}")
            raise HTTPException(
                status_code=500,
                detail="LibreOffice/soffice could not be started. Check the SOFFICE_CMD setting."
            ) from e

        # soffice names the output after the input file's base name.
        base_no_ext = os.path.splitext(os.path.basename(tmp_in_path))[0]
        guessed_pdf_path = os.path.join(tmp_out_dir, base_no_ext + ".pdf")
        if not os.path.exists(guessed_pdf_path):
            # Fall back to whatever PDF appeared in the output directory.
            pdf_candidates = [
                os.path.join(tmp_out_dir, f)
                for f in os.listdir(tmp_out_dir)
                if f.lower().endswith(".pdf")
            ]
            if not pdf_candidates:
                logger.error("No PDF produced by soffice")
                raise HTTPException(
                    status_code=500,
                    detail=(
                        "LibreOffice/soffice conversion succeeded but no PDF was found. "
                        "stderr=" + result.stderr.decode("utf-8", "ignore")
                    ),
                )
            guessed_pdf_path = max(pdf_candidates, key=os.path.getmtime)

        with open(guessed_pdf_path, "rb") as pdf_file:
            converted_pdf_bytes = pdf_file.read()
        png_bytes = pdf_first_page_to_png(converted_pdf_bytes)
        logger.info(f"[{tmp_in_path}] Thumbnail generated OK")
        return png_bytes
    finally:
        # Best-effort cleanup; a leftover temp dir must not mask the result
        # or the real error.
        shutil.rmtree(work_dir, ignore_errors=True)
# ─────────────────────────────────────────
# Routing logic based on extension
# ─────────────────────────────────────────
def generate_thumbnail_png_bytes(file_bytes: bytes, filename: str) -> bytes:
    """Pick the thumbnail renderer from the (case-insensitive) file name suffix.

    PDFs are rendered directly; .docx/.doc/.pptx/.ppt go through the Office
    conversion path. Any other suffix is rejected.

    Raises:
        HTTPException: 400 with a structured detail for unsupported types.
    """
    name_lc = filename.lower()

    if name_lc.endswith(".pdf"):
        return pdf_first_page_to_png(file_bytes)

    for office_suffix in (".docx", ".doc", ".pptx", ".ppt"):
        if name_lc.endswith(office_suffix):
            return office_first_page_to_png(file_bytes, office_suffix)

    unsupported_detail = {
        "error": "Unsupported file type",
        "filename": filename,
        "supported_formats": ["PDF", "DOC", "DOCX", "PPT", "PPTX"],
        "message": "Please upload a PDF or Office document."
    }
    raise HTTPException(status_code=400, detail=unsupported_detail)
# ─────────────────────────────────────────
# POST /upload
# ─────────────────────────────────────────
@app.post("/upload")
async def upload_document(file: UploadFile = File(...), replace_existing: bool = False):
    """
    Store an uploaded document and its first-page thumbnail.

    1. Read upload (at most MAX_FILE_SIZE + 1 bytes) and reject empty files.
    2. Check for duplicate filename (409 unless replace_existing is set).
    3. Enforce file size limit (413).
    4. Generate doc_id and derive storage paths.
    5. Save original file and generate/save the first-page thumbnail.
       If anything fails here, the partially written new files are removed
       and the error is re-raised.
    6. Only after the new files are safely on disk, remove the document
       being replaced (if any), so a failed replacement never destroys the
       existing copy.
    7. Store metadata (for /files list).
    8. Return URLs so frontend can render thumbnail immediately.

    Parameters:
    - file: The uploaded file
    - replace_existing: If True, replace existing file with same name

    Raises:
    - HTTPException 400: empty upload, unsupported type, or invalid PDF
    - HTTPException 409: duplicate filename and replace_existing is False
    - HTTPException 413: file larger than MAX_FILE_SIZE
    - HTTPException 500: Office conversion failure

    Response example:
    {
    "doc_id": "abc123...",
    "original_filename": "slides.pptx",
    "uploaded_at": "2025-10-27T10:15:23.123Z",
    "original_file_url": "/documents/abc123.pptx",
    "thumbnail_url": "/thumbnails/abc123.png",
    "duplicate_replaced": false
    }
    """
    # A multipart part may come without a filename; treat that as "" so it is
    # rejected as an unsupported type instead of crashing in splitext.
    filename = file.filename or ""
    logger.info(f"Upload started: {filename}")

    # One byte past the limit is enough to detect an oversized upload without
    # copying an arbitrarily large body into memory.
    content = await file.read(MAX_FILE_SIZE + 1)

    # 1. Validate not empty
    if not content:
        raise HTTPException(status_code=400, detail="Empty file upload.")

    # 2. Check for duplicate filename
    existing_doc_id, existing_info = check_duplicate_filename(filename)
    if existing_doc_id and not replace_existing:
        raise HTTPException(
            status_code=409,
            detail={
                "error": "File with this name already exists",
                "filename": filename,
                "existing_doc_id": existing_doc_id,
                "existing_uploaded_at": existing_info["uploaded_at"],
                "message": "Use replace_existing=true to replace the existing file"
            }
        )

    # 3. Validate size
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max {MAX_FILE_SIZE // (1024*1024)}MB."
        )

    # 4. Create doc_id and derive file paths
    doc_id = uuid.uuid4().hex
    _, ext = os.path.splitext(filename)
    stored_doc_path = os.path.join(DOCUMENTS_DIR, f"{doc_id}{ext}")
    stored_thumb_path = os.path.join(THUMBNAILS_DIR, f"{doc_id}.png")

    # 5. Save original file, then generate and save thumbnail
    try:
        with open(stored_doc_path, "wb") as f:
            f.write(content)
        logger.info(f"Saved original file: {stored_doc_path}")

        thumbnail_png_bytes = generate_thumbnail_png_bytes(content, filename)

        with open(stored_thumb_path, "wb") as t:
            t.write(thumbnail_png_bytes)
        logger.info(f"Saved thumbnail: {stored_thumb_path}")
    except Exception:
        # Roll back whatever was written for this doc_id on ANY failure (not
        # only HTTPException), then let the original error propagate.
        for partial_path in (stored_doc_path, stored_thumb_path):
            try:
                os.unlink(partial_path)
            except OSError:
                pass
        raise

    # 6. Handle replacement now that the new files exist
    duplicate_replaced = False
    if existing_doc_id:
        existing_ext = existing_info["extension"]
        existing_doc_path = os.path.join(DOCUMENTS_DIR, f"{existing_doc_id}{existing_ext}")
        existing_thumb_path = os.path.join(THUMBNAILS_DIR, f"{existing_doc_id}.png")
        for old_path in (existing_doc_path, existing_thumb_path):
            try:
                os.unlink(old_path)
            except FileNotFoundError:
                pass
            except OSError as e:
                # Keep going: the new upload is valid, a stale old file is
                # only a cleanup problem.
                logger.error(f"Failed to delete {old_path}: {e}")
        remove_file_metadata(existing_doc_id)
        duplicate_replaced = True
        logger.info(f"Replaced existing file: {filename}")

    # 7. Metadata
    uploaded_at = datetime.utcnow().isoformat() + "Z"
    add_file_metadata(doc_id, filename, ext, uploaded_at)

    # 8. Build response payload
    original_file_url = f"/documents/{doc_id}{ext}"
    thumbnail_url = f"/thumbnails/{doc_id}.png"
    resp = {
        "doc_id": doc_id,
        "original_filename": filename,
        "uploaded_at": uploaded_at,
        "original_file_url": original_file_url,
        "thumbnail_url": thumbnail_url,
        "duplicate_replaced": duplicate_replaced
    }
    logger.info(f"Upload completed: doc_id={doc_id}")
    return JSONResponse(resp)
# ─────────────────────────────────────────
# GET /check-duplicate/{filename}
# ─────────────────────────────────────────
@app.get("/check-duplicate/{filename}")
async def check_duplicate(filename: str):
    """
    Report whether ``filename`` is already present in the metadata store.

    When a match exists the response carries its doc_id, upload timestamp and
    the document/thumbnail URLs; otherwise it states that the name is free.
    """
    found_id, found_info = check_duplicate_filename(filename)

    if not found_id:
        return {
            "exists": False,
            "filename": filename,
            "message": "Filename is available for upload"
        }

    stored_ext = found_info["extension"]
    return {
        "exists": True,
        "filename": filename,
        "existing_doc_id": found_id,
        "existing_uploaded_at": found_info["uploaded_at"],
        "existing_file_url": f"/documents/{found_id}{stored_ext}",
        "existing_thumbnail_url": f"/thumbnails/{found_id}.png"
    }
# ─────────────────────────────────────────
# GET /files (list all stored docs for UI grid)
# ─────────────────────────────────────────
@app.get("/files")
async def list_files():
    """List every document in the metadata store with its file and thumbnail URLs.

    Returns a dict with the ``files`` list (in metadata order) and ``total``,
    the number of entries.
    """
    entries = [
        {
            "doc_id": known_id,
            "filename": details["filename"],
            "uploaded_at": details["uploaded_at"],
            "thumbnail_url": f"/thumbnails/{known_id}.png",
            "file_url": f"/documents/{known_id}{details['extension']}"
        }
        for known_id, details in load_metadata().items()
    ]
    return {"files": entries, "total": len(entries)}
# ─────────────────────────────────────────
# DELETE /delete/{doc_id}
# ─────────────────────────────────────────
@app.delete("/delete/{doc_id}")
async def delete_document(doc_id: str):
    """Delete the stored original, thumbnail and metadata entry for ``doc_id``.

    Only files whose name without extension is exactly ``doc_id`` are removed.
    (Prefix matching would let a short id such as "a" delete every document
    whose id merely starts with it.)

    Returns:
        Dict with the doc_id, ``deleted: True`` and the list of removed paths.

    Raises:
        HTTPException: 404 if ``doc_id`` is not a plain file stem or no file
            was removed for it.
    """
    logger.info(f"Delete requested: doc_id={doc_id}")

    # doc_id is joined into filesystem paths below, so refuse anything that
    # contains a path separator on this platform.
    if not doc_id or os.path.basename(doc_id) != doc_id:
        raise HTTPException(status_code=404, detail="No files found for that doc_id.")

    # original(s): stored as <doc_id><ext>, so compare the stem exactly
    candidates = [
        os.path.join(DOCUMENTS_DIR, fname)
        for fname in os.listdir(DOCUMENTS_DIR)
        if os.path.splitext(fname)[0] == doc_id
    ]
    # thumbnail
    candidates.append(os.path.join(THUMBNAILS_DIR, f"{doc_id}.png"))

    removed_paths = []
    for full_path in candidates:
        if not os.path.isfile(full_path):
            continue
        try:
            os.remove(full_path)
        except OSError as e:
            logger.error(f"Failed to delete {full_path}: {e}")
        else:
            removed_paths.append(full_path)
            logger.info(f"Deleted: {full_path}")

    if not removed_paths:
        raise HTTPException(status_code=404, detail="No files found for that doc_id.")

    # cleanup metadata
    remove_file_metadata(doc_id)
    return {
        "doc_id": doc_id,
        "deleted": True,
        "removed_files": removed_paths
    }
# ─────────────────────────────────────────
# Health / root
# ─────────────────────────────────────────
@app.get("/health")
async def health():
    """Report service status: current UTC time, storage dir, tracked file count and soffice path."""
    now_iso = datetime.utcnow().isoformat() + "Z"
    tracked = load_metadata()
    return dict(
        status="ok",
        time=now_iso,
        storage_dir=BASE_STORAGE_DIR,
        total_files=len(tracked),
        soffice_cmd=SOFFICE_CMD,
    )
@app.get("/")
async def root():
    """Describe the service: name, version, available endpoints and supported formats."""
    return {
        "service": "On-Prem Document Thumbnail API",
        "version": "1.0.0",
        "endpoints": {
            "POST /upload": "Upload a document and generate thumbnail",
            # Previously missing from this listing although the route exists.
            "GET /check-duplicate/{filename}": "Check whether a filename is already stored",
            "GET /files": "List uploaded files + thumbnails",
            "DELETE /delete/{doc_id}": "Delete a file and its thumbnail",
            "GET /thumbnails/{doc_id}.png": "Thumbnail image",
            "GET /documents/{doc_id}.ext": "Original file",
            "GET /health": "Health check"
        },
        "supported_formats": ["PDF", "DOC", "DOCX", "PPT", "PPTX"]
    }
# Run local (for dev)
if __name__ == "__main__":
    import uvicorn

    # Listen on every interface (0.0.0.0) so other machines on the LAN can
    # reach the dev server, on port 8000.
    dev_server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **dev_server_options)