Skip to content

Commit 78263af

Browse files
authored
Merge pull request #422 from internetarchive/adam/fix_claim_sites_pre_filter
fix: We were applying the max_sites_to_claim filter too early. Many s…
2 parents 4d1fb31 + 4430605 commit 78263af

File tree

3 files changed

+44
-3
lines changed

3 files changed

+44
-3
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,8 @@ check-format:
5959
format:
6060
$(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --fix .
6161
$(VIRTUAL_ENV_DIR)/bin/ruff format .
62+
63+
.PHONY: test
64+
test:
65+
uv sync --all-extras
66+
uv run py.test tests

brozzler/frontier.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,6 @@ def filter_claimable_site_ids(
6565
if is_claimable:
6666
claimable_sites.append(site)
6767

68-
if len(claimable_sites) >= max_sites_to_claim:
69-
break
70-
7168
site_ids_to_claim = []
7269
# gather sites that are under the max without going over
7370
for site in claimable_sites:

tests/test_frontier.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,6 +1052,45 @@ def test_max_claimed_sites_cross_job(rethinker):
10521052
rr.table("sites").delete().run()
10531053

10541054

1055+
def test_many_active_claimed_sites_cross_job(rethinker):
1056+
rr = rethinker
1057+
frontier = brozzler.RethinkDbFrontier(rr)
1058+
1059+
# clean slate
1060+
rr.table("jobs").delete().run()
1061+
rr.table("sites").delete().run()
1062+
1063+
job_conf_1 = {
1064+
"id": 1,
1065+
"seeds": [{"url": f"http://example.com/{i}"} for i in range(0, 2000)],
1066+
"max_claimed_sites": 3,
1067+
}
1068+
job_conf_2 = {
1069+
"id": 2,
1070+
"seeds": [
1071+
{"url": "http://example.com/1"},
1072+
{"url": "http://example.com/2"},
1073+
{"url": "http://example.com/3"},
1074+
{"url": "http://example.com/4"},
1075+
{"url": "http://example.com/5"},
1076+
],
1077+
"max_claimed_sites": 5,
1078+
}
1079+
1080+
brozzler.new_job(frontier, job_conf_1)
1081+
1082+
# Claim all possible sites from job 1. We should only get 3 due to max_claimed_sites
1083+
claimed_sites_1 = frontier.claim_sites(4)
1084+
assert len(claimed_sites_1) == 3
1085+
1086+
# Add a second job with 5 seeds
1087+
brozzler.new_job(frontier, job_conf_2)
1088+
1089+
# We shouldn't have trouble claiming sites from job 2
1090+
claimed_sites_1 = frontier.claim_sites(5)
1091+
assert len(claimed_sites_1) == 5
1092+
1093+
10551094
# Works locally, but reliably fails in CI.
10561095
@pytest.mark.xfail
10571096
def test_max_claimed_sites_load_perf(rethinker):

0 commit comments

Comments
 (0)