Skip to content

Commit d7c6182

Browse files
jensens and claude committed
Fix test isolation and add real Tika integration tests
- Clean queue table between tests (DROP + CREATE, not just IF NOT EXISTS)
- Use tuple_row cursor in enqueue tests (matches production cursor type)
- Add 2 integration tests using real Tika server (plain text + HTML)
- All 24 Tika tests pass with PGCATALOG_TIKA_URL set

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 94e1871 commit d7c6182

2 files changed

Lines changed: 94 additions & 5 deletions

File tree

tests/test_tika_enqueue.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from plone.pgcatalog.processor import TIKA_URL
66
from plone.pgcatalog.schema import TEXT_EXTRACTION_QUEUE
77
from psycopg.rows import dict_row
8+
from psycopg.rows import tuple_row
89
from tests.conftest import DSN
910
from tests.conftest import insert_object
1011

@@ -20,10 +21,11 @@
2021
def pg_conn_with_queue(pg_conn_with_catalog):
2122
"""Database with catalog schema + extraction queue table."""
2223
conn = pg_conn_with_catalog
23-
# Install queue schema
24+
# Drop and recreate to ensure clean state between tests
25+
conn.execute("DROP TABLE IF EXISTS text_extraction_queue, blob_state CASCADE")
26+
conn.commit()
2427
conn.execute(TEXT_EXTRACTION_QUEUE)
2528
conn.commit()
26-
# Also create blob_state if it doesn't exist
2729
conn.execute(
2830
"CREATE TABLE IF NOT EXISTS blob_state ("
2931
" zoid BIGINT NOT NULL,"
@@ -101,7 +103,8 @@ def test_enqueue_blob_with_tika_url(self, pg_conn_with_queue):
101103
# Simulate process() accumulating a candidate
102104
proc._tika_candidates = [{"zoid": zoid, "content_type": "application/pdf"}]
103105

104-
with conn.cursor() as cur:
106+
# Use tuple_row to match production cursor (zodb-pgjsonb uses default)
107+
with conn.cursor(row_factory=tuple_row) as cur:
105108
proc._enqueue_tika_jobs(cur)
106109
conn.commit()
107110

@@ -124,7 +127,7 @@ def test_no_enqueue_without_blob(self, pg_conn_with_queue):
124127
proc = CatalogStateProcessor()
125128
proc._tika_candidates = [{"zoid": zoid, "content_type": "application/pdf"}]
126129

127-
with conn.cursor() as cur:
130+
with conn.cursor(row_factory=tuple_row) as cur:
128131
proc._enqueue_tika_jobs(cur)
129132
conn.commit()
130133

@@ -145,7 +148,7 @@ def test_idempotent_enqueue(self, pg_conn_with_queue):
145148
# Enqueue twice
146149
for _ in range(2):
147150
proc._tika_candidates = [{"zoid": zoid, "content_type": "application/pdf"}]
148-
with conn.cursor() as cur:
151+
with conn.cursor(row_factory=tuple_row) as cur:
149152
proc._enqueue_tika_jobs(cur)
150153
conn.commit()
151154

tests/test_tika_worker.py

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from unittest.mock import patch
1414
from zodb_pgjsonb.schema import HISTORY_FREE_SCHEMA
1515

16+
import os
1617
import psycopg
1718
import pytest
1819
import threading
@@ -295,3 +296,88 @@ def test_shutdown_flag(self):
295296
assert not worker._shutdown.is_set()
296297
worker.shutdown()
297298
assert worker._shutdown.is_set()
299+
300+
301+
# ── Integration tests (require real Tika server) ─────────────────────
302+
303+
TIKA_URL = os.environ.get("PGCATALOG_TIKA_URL", "").strip()
304+
305+
306+
class TestWorkerIntegration:
307+
"""End-to-end tests with real Tika server."""
308+
309+
pytestmark = pytest.mark.skipif(not TIKA_URL, reason="PGCATALOG_TIKA_URL not set")
310+
311+
def test_extract_plain_text(self, worker_db):
312+
"""Worker extracts text from a plain text blob via real Tika."""
313+
conn = worker_db
314+
zoid, tid = 50, 1
315+
text_content = b"Important findings about quantum computing research"
316+
_insert_object_with_blob(
317+
conn,
318+
zoid,
319+
tid,
320+
blob_data=text_content,
321+
idx={"Language": "en", "Title": "Research"},
322+
)
323+
_enqueue_job(conn, zoid, tid, content_type="text/plain")
324+
325+
# Set initial searchable_text
326+
conn.execute(
327+
"UPDATE object_state SET searchable_text = "
328+
"to_tsvector('english', 'Research') "
329+
"WHERE zoid = %(zoid)s",
330+
{"zoid": zoid},
331+
)
332+
conn.commit()
333+
334+
worker = TikaWorker(dsn=DSN, tika_url=TIKA_URL)
335+
result = worker._process_one()
336+
assert result is True
337+
338+
# Verify extraction completed
339+
status = _get_queue_status(conn, zoid)
340+
assert status["status"] == "done"
341+
assert status["error"] is None
342+
343+
# Verify searchable_text was updated with extracted terms
344+
with conn.cursor() as cur:
345+
cur.execute(
346+
"SELECT searchable_text::text FROM object_state WHERE zoid = %(zoid)s",
347+
{"zoid": zoid},
348+
)
349+
row = cur.fetchone()
350+
tsv_text = row["searchable_text"]
351+
assert "quantum" in tsv_text or "comput" in tsv_text
352+
353+
def test_extract_html_content(self, worker_db):
354+
"""Worker extracts text from HTML via real Tika."""
355+
conn = worker_db
356+
zoid, tid = 60, 1
357+
html_blob = b"<html><body><h1>PostgreSQL Performance</h1><p>Indexes matter.</p></body></html>"
358+
_insert_object_with_blob(conn, zoid, tid, blob_data=html_blob)
359+
_enqueue_job(conn, zoid, tid, content_type="text/html")
360+
361+
conn.execute(
362+
"UPDATE object_state SET searchable_text = ''::tsvector, "
363+
'idx = \'{"Language": "en"}\'::jsonb '
364+
"WHERE zoid = %(zoid)s",
365+
{"zoid": zoid},
366+
)
367+
conn.commit()
368+
369+
worker = TikaWorker(dsn=DSN, tika_url=TIKA_URL)
370+
worker._process_one()
371+
372+
status = _get_queue_status(conn, zoid)
373+
assert status["status"] == "done"
374+
375+
with conn.cursor() as cur:
376+
cur.execute(
377+
"SELECT searchable_text::text FROM object_state WHERE zoid = %(zoid)s",
378+
{"zoid": zoid},
379+
)
380+
row = cur.fetchone()
381+
tsv_text = row["searchable_text"]
382+
# Tika should have extracted "PostgreSQL Performance" and "Indexes matter"
383+
assert "postgresql" in tsv_text or "perform" in tsv_text or "index" in tsv_text

0 commit comments

Comments
 (0)