Skip to content

Commit 808edae

Browse files
fix: exclude DBLP from default sync (#1030)
DBLP venue sync is resource-heavy for routine runs. Make large datasets opt-in via --include-large-datasets while keeping explicit backend selection unchanged. Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 46fd076 commit 808edae

File tree

2 files changed

+79
-4
lines changed

2 files changed

+79
-4
lines changed

src/aletheia_probe/cli.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
from .utils.dead_code import code_is_used
2626

2727

28+
# Backend names excluded from a default `sync` run because their datasets are
# large/resource-heavy; users opt in with --include-large-datasets, and naming
# a backend explicitly on the command line always syncs it regardless.
LARGE_SYNC_BACKENDS: frozenset[str] = frozenset({"dblp_venues"})
29+
30+
2831
# Import cache_sync last: instantiation at module level may raise SchemaVersionError
2932
# if the database schema is outdated. Caught here for a clean error message.
3033
_startup_error: SchemaVersionError | None = None
@@ -225,31 +228,54 @@ def config() -> None:
225228

226229
@main.command()
227230
@click.option("--force", is_flag=True, help="Force sync even if data appears fresh")
231+
@click.option(
232+
"--include-large-datasets",
233+
is_flag=True,
234+
help=(
235+
"Include large datasets in default sync. "
236+
f"Currently: {', '.join(sorted(LARGE_SYNC_BACKENDS))}"
237+
),
238+
)
228239
@click.argument("backend_names", nargs=-1, required=False)
229-
def sync(force: bool, backend_names: tuple[str, ...]) -> None:
240+
def sync(
241+
force: bool, include_large_datasets: bool, backend_names: tuple[str, ...]
242+
) -> None:
230243
"""Manually sync cache with backend configuration.
231244
232245
Optionally specify one or more backend names to sync only those backends.
233246
Examples:
234247
aletheia-probe sync # Sync all backends
248+
aletheia-probe sync --include-large-datasets # Include large datasets
235249
aletheia-probe sync scopus # Sync only scopus
236250
aletheia-probe sync bealls doaj # Sync only bealls and doaj
237251
238252
Args:
239253
force: Whether to force sync even if data appears fresh.
254+
include_large_datasets: Whether to include large datasets in default sync.
240255
backend_names: Optional tuple of backend names to sync.
241256
"""
242257
# Auto-register custom lists before sync
243258
from .cache.custom_list_manager import auto_register_custom_lists
244259

245260
auto_register_custom_lists()
261+
backend_filter: list[str] | None = None
262+
if backend_names:
263+
backend_filter = list(backend_names)
264+
elif not include_large_datasets:
265+
from .backends.base import get_backend_registry
266+
267+
backend_filter = [
268+
backend_name
269+
for backend_name in get_backend_registry().get_backend_names()
270+
if backend_name not in LARGE_SYNC_BACKENDS
271+
]
246272

247273
try:
248274
# The cache_sync_manager handles all output through the dual logger
249275
result = asyncio.run(
250276
cache_sync_manager.sync_cache_with_config(
251277
force=force,
252-
backend_filter=list(backend_names) if backend_names else None,
278+
backend_filter=backend_filter,
253279
show_progress=True,
254280
)
255281
)

tests/unit/test_cli.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,13 +221,62 @@ def run_coro(coro):
221221
coro.close() # Close without running
222222
return {}
223223

224-
with patch("aletheia_probe.cli.asyncio.run", side_effect=run_coro) as mock_run:
224+
mock_registry = Mock()
225+
mock_registry.get_backend_names.return_value = ["bealls", "dblp_venues", "doaj"]
226+
227+
with (
228+
patch(
229+
"aletheia_probe.backends.base.get_backend_registry",
230+
return_value=mock_registry,
231+
),
232+
patch("aletheia_probe.cli.asyncio.run", side_effect=run_coro) as mock_run,
233+
):
225234
result = runner.invoke(main, ["sync", "--force"])
226235

227236
assert result.exit_code == 0
228237
# Verify force=True was passed to sync_cache_with_config
229238
mock_sync.assert_called_once_with(
230-
force=True, backend_filter=None, show_progress=True
239+
force=True,
240+
backend_filter=["bealls", "doaj"],
241+
show_progress=True,
242+
)
243+
244+
def test_sync_command_include_large_datasets(self, runner, mock_cache_sync_manager):
    """Test sync command with include-large-datasets flag."""
    # Wrap the fixture's coroutine factory in a Mock so call args can be inspected.
    real_sync = mock_cache_sync_manager.sync_cache_with_config
    sync_mock = Mock(side_effect=real_sync)
    mock_cache_sync_manager.sync_cache_with_config = sync_mock

    def drain_coro(coro):
        """Close the coroutine without awaiting it and return an empty result."""
        coro.close()
        return {}

    # Replace asyncio.run so no event loop is actually started.
    with patch("aletheia_probe.cli.asyncio.run", side_effect=drain_coro):
        result = runner.invoke(main, ["sync", "--include-large-datasets"])

    assert result.exit_code == 0
    # With the opt-in flag, no filter is built: large datasets are included,
    # so backend_filter stays None.
    sync_mock.assert_called_once_with(
        force=False, backend_filter=None, show_progress=True
    )
262+
263+
def test_sync_command_explicit_backend_names(self, runner, mock_cache_sync_manager):
    """Test sync command with explicit backend names."""
    # Intercept sync_cache_with_config while preserving its behavior.
    real_sync = mock_cache_sync_manager.sync_cache_with_config
    sync_mock = Mock(side_effect=real_sync)
    mock_cache_sync_manager.sync_cache_with_config = sync_mock

    def drain_coro(coro):
        """Close the coroutine without awaiting it and return an empty result."""
        coro.close()
        return {}

    # Replace asyncio.run so no event loop is actually started.
    with patch("aletheia_probe.cli.asyncio.run", side_effect=drain_coro):
        result = runner.invoke(main, ["sync", "dblp_venues"])

    assert result.exit_code == 0
    # Explicitly named backends are always synced, even large ones:
    # the filter contains exactly what the user asked for.
    sync_mock.assert_called_once_with(
        force=False, backend_filter=["dblp_venues"], show_progress=True
    )
232281

233282
def test_sync_command_skipped(self, runner):

0 commit comments

Comments (0)