
Commit 533f78c

add retries for add_crawl_pages_to_db_from_wacz()
add typing
1 parent f71a89e commit 533f78c
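
The retry logic added here follows a common bounded-retry pattern: run the operation, and if it fails with what looks like a transient error, wait briefly and try again, up to a fixed number of attempts. A minimal, generic sketch of that pattern, assuming an arbitrary zero-argument coroutine op (the helper name, error type, and delay below are illustrative, not part of this commit):

import asyncio


async def run_with_retries(op, num_retries=5, delay=5):
    """Run `op()`, retrying transient failures a bounded number of times."""
    retry = 0
    while True:
        try:
            return await op()
        except OSError as err:  # placeholder for a transient error type
            if retry >= num_retries:
                raise
            retry += 1
            print(f"Retrying, {retry} of {num_retries}: {err}")
            await asyncio.sleep(delay)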

2 files changed: +63 -37 lines

backend/btrixcloud/colls.py

+1 -1
@@ -518,7 +518,7 @@ async def list_collections(
 
     async def get_collection_crawl_resources(
         self, coll_id: UUID, include_preloads=False
-    ):
+    ) -> tuple[List[CrawlFileOut], List[PreloadResource], bool]:
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)
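
The new annotation spells out that callers receive a list of pre-signed crawl file outputs, a list of preload resources, and a boolean flag (the diff does not show what the flag means). A hypothetical caller, assuming a coll_ops instance and an existing coll_id, might unpack it like this:

from uuid import UUID


async def print_collection_resources(coll_ops, coll_id: UUID) -> None:
    # Hypothetical caller; `coll_ops` is assumed to be the collections ops
    # object, and the meaning of the final boolean is not shown in this diff.
    resources, preloads, flag = await coll_ops.get_collection_crawl_resources(
        coll_id, include_preloads=True
    )
    for res in resources:  # each item is a CrawlFileOut per the annotation
        print(res)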

backend/btrixcloud/pages.py

+62 -36
@@ -4,13 +4,14 @@
 
 import asyncio
 import re
-import time
 import traceback
 import urllib.parse
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
 
+from remotezip import RemoteIOError
+
 from fastapi import Depends, HTTPException, Request, Response
 import pymongo
 
@@ -81,50 +82,75 @@ async def set_ops(self, background_job_ops: BackgroundJobOps):
         """Set ops classes as needed"""
         self.background_job_ops = background_job_ops
 
-    async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
+    async def add_crawl_pages_to_db_from_wacz(
+        self, crawl_id: str, batch_size=100, num_retries=5
+    ):
         """Add pages to database from WACZ files"""
         pages_buffer: List[Page] = []
-        try:
-            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
-            stream = await self.storage_ops.sync_stream_wacz_pages(
-                crawl.resources or []
-            )
-            new_uuid = crawl.type == "upload"
-            seed_count = 0
-            non_seed_count = 0
-            for page_dict in stream:
-                if not page_dict.get("url"):
-                    continue
+        retry = 0
+        while True:
+            try:
+                crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+                stream = await self.storage_ops.sync_stream_wacz_pages(
+                    crawl.resources or []
+                )
+                new_uuid = crawl.type == "upload"
+                seed_count = 0
+                non_seed_count = 0
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get(
+                        "seed"
+                    )
 
-                page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed")
+                    if page_dict.get("isSeed"):
+                        seed_count += 1
+                    else:
+                        non_seed_count += 1
 
-                if page_dict.get("isSeed"):
-                    seed_count += 1
-                else:
-                    non_seed_count += 1
+                    if len(pages_buffer) > batch_size:
+                        await self._add_pages_to_db(crawl_id, pages_buffer)
+                        pages_buffer = []
+
+                    pages_buffer.append(
+                        self._get_page_from_dict(
+                            page_dict, crawl_id, crawl.oid, new_uuid
+                        )
+                    )
 
-                if len(pages_buffer) > batch_size:
+                # Add any remaining pages in buffer to db
+                if pages_buffer:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
-                    pages_buffer = []
 
-                pages_buffer.append(
-                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid)
+                await self.set_archived_item_page_counts(crawl_id)
+
+                print(
+                    f"Added pages for crawl {crawl_id}: "
+                    + f"{seed_count} Seed, {non_seed_count} Non-Seed",
+                    flush=True,
                 )
 
-            # Add any remaining pages in buffer to db
-            if pages_buffer:
-                await self._add_pages_to_db(crawl_id, pages_buffer)
+            except RemoteIOError as rio:
+                msg = str(rio)
+                if msg.startswith("503") or msg.startswith("429"):
+                    if retry < num_retries:
+                        retry += 1
+                        print(f"Retrying, {retry} of {num_retries}, {msg}")
+                        await asyncio.sleep(5)
+                        continue
 
-            await self.set_archived_item_page_counts(crawl_id)
+                print(f"No more retries, {msg}")
 
-            print(
-                f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed",
-                flush=True,
-            )
-        # pylint: disable=broad-exception-caught, raise-missing-from
-        except Exception as err:
-            traceback.print_exc()
-            print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
+            # pylint: disable=broad-exception-caught, raise-missing-from
+            except Exception as err:
+                traceback.print_exc()
+                print(
+                    f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True
+                )
+
+            break
 
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool
@@ -982,7 +1008,7 @@ async def process_finished_crawls():
                 break
 
             print("Running crawls remain, waiting for them to finish")
-            time.sleep(30)
+            await asyncio.sleep(30)
 
     await process_finished_crawls()
 
@@ -994,7 +1020,7 @@ async def process_finished_crawls():
        if in_progress is None:
            break
        print("Unmigrated crawls remain, finishing job")
-        time.sleep(5)
+        await asyncio.sleep(5)
 
 
 # ============================================================================
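
The last two hunks replace time.sleep with await asyncio.sleep. Inside a coroutine, time.sleep blocks the entire event loop, so every other task stalls until it returns; asyncio.sleep suspends only the current task. A small self-contained sketch of the difference (all names here are illustrative):

import asyncio
import time


async def ticker():
    # Should print roughly once per second while the event loop is free.
    for i in range(5):
        print("tick", i)
        await asyncio.sleep(1)


async def blocking_wait():
    time.sleep(3)  # blocks the whole event loop; ticker() cannot run meanwhile


async def cooperative_wait():
    await asyncio.sleep(3)  # yields to the event loop; ticker() keeps running


async def main():
    # Swap cooperative_wait() for blocking_wait() to see the ticks stall.
    await asyncio.gather(ticker(), cooperative_wait())


asyncio.run(main())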
