-
Notifications
You must be signed in to change notification settings - Fork 705
Expand file tree
/
Copy path_request_manager_tandem.py
More file actions
108 lines (82 loc) · 3.97 KB
/
_request_manager_tandem.py
File metadata and controls
108 lines (82 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from __future__ import annotations
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING
from typing_extensions import override
from crawlee._utils.docs import docs_group
from crawlee.request_loaders import RequestManager
if TYPE_CHECKING:
from collections.abc import Sequence
from crawlee import Request
from crawlee.request_loaders import RequestLoader
from crawlee.storage_clients.models import ProcessedRequest
# Module-level logger named after this module; used to report failed loader-to-manager transfers.
logger = getLogger(__name__)
@docs_group('Request loaders')
class RequestManagerTandem(RequestManager):
    """Pair a `RequestLoader` with a `RequestManager` and expose them as a single `RequestManager`.

    Requests are pulled from the loader one at a time and transferred into the manager as they are fetched, so
    requests from both sources end up being processed. The loader is treated as read-only: every enqueue, reclaim,
    handled-mark and drop is delegated to the manager.
    """

    def __init__(self, request_loader: RequestLoader, request_manager: RequestManager) -> None:
        """Store the two collaborators.

        Args:
            request_loader: Source of requests that is only read from and marked as handled.
            request_manager: Destination that receives transferred requests and all write operations.
        """
        self._read_write_manager = request_manager
        self._read_only_loader = request_loader

    @override
    async def get_handled_count(self) -> int:
        """Return the handled count reported by the manager (the loader's count is not included)."""
        handled = await self._read_write_manager.get_handled_count()
        return handled

    @override
    async def get_total_count(self) -> int:
        """Return the loader's total count plus the manager's total count."""
        loader_total = await self._read_only_loader.get_total_count()
        manager_total = await self._read_write_manager.get_total_count()
        return loader_total + manager_total

    @override
    async def is_empty(self) -> bool:
        """Return a truthy value only when both the loader and the manager report being empty."""
        # The manager is queried only when the loader already reports empty.
        loader_empty = await self._read_only_loader.is_empty()
        return loader_empty and await self._read_write_manager.is_empty()

    @override
    async def is_finished(self) -> bool:
        """Return a truthy value only when both the loader and the manager report being finished."""
        # The manager is queried only when the loader already reports finished.
        loader_finished = await self._read_only_loader.is_finished()
        return loader_finished and await self._read_write_manager.is_finished()

    @override
    async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest | None:
        """Enqueue a single request in the manager and return whatever the manager returns.

        Args:
            request: The request (or URL string) to enqueue.
            forefront: Passed through unchanged to the manager's `add_request`.
        """
        manager = self._read_write_manager
        return await manager.add_request(request, forefront=forefront)

    @override
    async def add_requests(
        self,
        requests: Sequence[str | Request],
        *,
        forefront: bool = False,
        batch_size: int = 1000,
        wait_time_between_batches: timedelta = timedelta(seconds=1),
        wait_for_all_requests_to_be_added: bool = False,
        wait_for_all_requests_to_be_added_timeout: timedelta | None = None,
    ) -> None:
        """Enqueue several requests in the manager.

        Every argument is forwarded unchanged to the manager's `add_requests`, and its result is returned as is.
        """
        manager = self._read_write_manager
        outcome = await manager.add_requests(
            requests,
            forefront=forefront,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
            wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
        )
        return outcome

    @override
    async def fetch_next_request(self) -> Request | None:
        """Transfer one request from the loader into the manager, then fetch from the manager.

        Returns:
            The manager's next request, or `None` when moving the loader's request into the manager raised
            (the failure is logged and that request is dropped).
        """
        loader = self._read_only_loader
        manager = self._read_write_manager

        # A finished loader is not asked for a request at all.
        pending = None if await loader.is_finished() else await loader.fetch_next_request()

        # Nothing to transfer - serve straight from the manager.
        if not pending:
            return await manager.fetch_next_request()

        try:
            # The transferred request goes to the front of the manager.
            await manager.add_request(pending, forefront=True)
        except Exception:
            # NOTE(review): on this path the request is never marked as handled on the loader - confirm that
            # the loader can still report itself finished afterwards.
            logger.exception(
                'Adding request from the RequestLoader to the RequestManager failed, the request has been dropped',
                extra={'url': pending.url, 'unique_key': pending.unique_key},
            )
            return None
        else:
            # The manager owns the request now, so the loader can let go of it.
            await loader.mark_request_as_handled(pending)
            return await manager.fetch_next_request()

    @override
    async def reclaim_request(self, request: Request, *, forefront: bool = False) -> None:
        """Hand the request back to the manager; the loader is not involved.

        Args:
            request: The request to reclaim.
            forefront: Passed through unchanged to the manager's `reclaim_request`.
        """
        manager = self._read_write_manager
        await manager.reclaim_request(request, forefront=forefront)

    @override
    async def mark_request_as_handled(self, request: Request) -> None:
        """Mark the request as handled in the manager; the loader is not involved."""
        manager = self._read_write_manager
        await manager.mark_request_as_handled(request)

    @override
    async def drop(self) -> None:
        """Drop the manager; nothing is called on the loader."""
        manager = self._read_write_manager
        await manager.drop()