Skip to content

Commit 5a5fd9b

Browse files
committed
fix: Add URL extraction to AsyncDBWriter to populate journal_urls table
The journal_urls table was remaining empty despite successful sync operations because the AsyncDBWriter._batch_write_journals() method was not extracting and storing URLs from journal data. This fix adds comprehensive URL extraction logic that:

- Extracts URLs from the top-level 'urls' field in journal records
- Extracts URLs from metadata['urls'] (Algerian Ministry format)
- Extracts URLs from metadata['website_url'] (Kscien format)
- Extracts URLs from metadata['source_url'] (general source URLs)
- Deduplicates URLs per journal using set() for efficiency
- Uses the INSERT OR REPLACE SQL pattern consistent with existing code
- Follows the database schema with proper journal_id foreign-key relationships

Resolves an issue where journal URL data was being collected by data sources but lost during the batch write process, leaving the journal_urls table empty.
1 parent bdf0b93 commit 5a5fd9b

File tree

1 file changed

+43
-0
lines changed

1 file changed

+43
-0
lines changed

src/aletheia_probe/cache_sync.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,7 @@ def _batch_write_journals(
137137
name_inserts = []
138138
assessment_inserts = []
139139
metadata_inserts = []
140+
url_inserts = []
140141

141142
# First, collect all normalized names that will be processed
142143
normalized_names = [
@@ -206,6 +207,39 @@ def _batch_write_journals(
206207
# Source assessments
207208
assessment_inserts.append((journal_id, source_id, list_type, 1.0))
208209

210+
# URLs - Extract from multiple sources and deduplicate
211+
urls_to_insert = set() # Use set for automatic deduplication
212+
213+
# Check for top-level urls field
214+
if journal.get("urls"):
215+
for url in journal["urls"]:
216+
if url and isinstance(url, str) and url.strip():
217+
urls_to_insert.add(url.strip())
218+
219+
# Extract URLs from metadata
220+
if metadata:
221+
# Handle Algerian Ministry format: metadata["urls"] as list
222+
if "urls" in metadata and isinstance(metadata["urls"], list):
223+
for url in metadata["urls"]:
224+
if url and isinstance(url, str) and url.strip():
225+
urls_to_insert.add(url.strip())
226+
227+
# Handle Kscien format: metadata["website_url"] as string
228+
if "website_url" in metadata and metadata["website_url"]:
229+
url = metadata["website_url"]
230+
if isinstance(url, str) and url.strip():
231+
urls_to_insert.add(url.strip())
232+
233+
# Handle other potential URL fields in metadata
234+
if "source_url" in metadata and metadata["source_url"]:
235+
url = metadata["source_url"]
236+
if isinstance(url, str) and url.strip():
237+
urls_to_insert.add(url.strip())
238+
239+
# Add deduplicated URLs to batch inserts
240+
for url in urls_to_insert:
241+
url_inserts.append((journal_id, url))
242+
209243
# Metadata
210244
if metadata:
211245
for key, value in metadata.items():
@@ -253,6 +287,15 @@ def _batch_write_journals(
253287
metadata_inserts,
254288
)
255289

290+
# Batch insert/update journal URLs
291+
if url_inserts:
292+
cursor.executemany(
293+
"""INSERT OR REPLACE INTO journal_urls
294+
(journal_id, url, last_seen_at)
295+
VALUES (?, ?, CURRENT_TIMESTAMP)""",
296+
url_inserts,
297+
)
298+
256299
# Commit transaction
257300
conn.execute("COMMIT")
258301

0 commit comments

Comments (0)