|
13 | 13 | from unittest.mock import patch |
14 | 14 | from zodb_pgjsonb.schema import HISTORY_FREE_SCHEMA |
15 | 15 |
|
| 16 | +import os |
16 | 17 | import psycopg |
17 | 18 | import pytest |
18 | 19 | import threading |
@@ -295,3 +296,88 @@ def test_shutdown_flag(self): |
295 | 296 | assert not worker._shutdown.is_set() |
296 | 297 | worker.shutdown() |
297 | 298 | assert worker._shutdown.is_set() |
| 299 | + |
| 300 | + |
| 301 | +# ── Integration tests (require real Tika server) ───────────────────── |
| 302 | + |
# Base URL of the Tika server used by the integration tests below.
# Whitespace is stripped so a blank or whitespace-only value counts as unset.
TIKA_URL = os.getenv("PGCATALOG_TIKA_URL", "").strip()
| 304 | + |
| 305 | + |
class TestWorkerIntegration:
    """End-to-end tests that run ``TikaWorker`` against a real Tika server.

    Every test in this class is skipped when ``TIKA_URL`` is empty.
    """

    pytestmark = pytest.mark.skipif(not TIKA_URL, reason="PGCATALOG_TIKA_URL not set")

    @staticmethod
    def _searchable_text(conn, zoid):
        """Return ``object_state.searchable_text`` for *zoid*, cast to text.

        Fails with an explicit assertion when the row is missing, instead of
        letting a ``None`` row surface as an opaque ``TypeError``.
        """
        with conn.cursor() as cur:
            cur.execute(
                "SELECT searchable_text::text FROM object_state WHERE zoid = %(zoid)s",
                {"zoid": zoid},
            )
            row = cur.fetchone()
        assert row is not None, f"no object_state row for zoid {zoid}"
        return row["searchable_text"]

    def test_extract_plain_text(self, worker_db):
        """Worker extracts text from a plain text blob via real Tika."""
        conn = worker_db
        zoid, tid = 50, 1
        text_content = b"Important findings about quantum computing research"
        _insert_object_with_blob(
            conn,
            zoid,
            tid,
            blob_data=text_content,
            idx={"Language": "en", "Title": "Research"},
        )
        _enqueue_job(conn, zoid, tid, content_type="text/plain")

        # Set initial searchable_text
        conn.execute(
            "UPDATE object_state SET searchable_text = "
            "to_tsvector('english', 'Research') "
            "WHERE zoid = %(zoid)s",
            {"zoid": zoid},
        )
        conn.commit()

        worker = TikaWorker(dsn=DSN, tika_url=TIKA_URL)
        assert worker._process_one() is True

        # Verify extraction completed
        status = _get_queue_status(conn, zoid)
        assert status["status"] == "done"
        assert status["error"] is None

        # Verify searchable_text was updated with extracted terms.
        # "research" is not checked: it was already in the initial tsvector.
        tsv_text = self._searchable_text(conn, zoid)
        assert "quantum" in tsv_text or "comput" in tsv_text

    def test_extract_html_content(self, worker_db):
        """Worker extracts text from HTML via real Tika."""
        conn = worker_db
        zoid, tid = 60, 1
        html_blob = b"<html><body><h1>PostgreSQL Performance</h1><p>Indexes matter.</p></body></html>"
        _insert_object_with_blob(conn, zoid, tid, blob_data=html_blob)
        _enqueue_job(conn, zoid, tid, content_type="text/html")

        # Start from an empty tsvector so any term found later came from Tika.
        conn.execute(
            "UPDATE object_state SET searchable_text = ''::tsvector, "
            'idx = \'{"Language": "en"}\'::jsonb '
            "WHERE zoid = %(zoid)s",
            {"zoid": zoid},
        )
        conn.commit()

        worker = TikaWorker(dsn=DSN, tika_url=TIKA_URL)
        # Fail right here if the worker did not pick up the queued job.
        assert worker._process_one() is True

        status = _get_queue_status(conn, zoid)
        assert status["status"] == "done"
        assert status["error"] is None

        tsv_text = self._searchable_text(conn, zoid)
        # Tika should have extracted "PostgreSQL Performance" and "Indexes matter"
        assert "postgresql" in tsv_text or "perform" in tsv_text or "index" in tsv_text
0 commit comments