Skip to content

Commit 62e47a8

Browse files
ikreymert and tw4l
authored
support overriding crawler image pull policy per channel (#2523)
- add 'imagePullPolicy' field to each crawler channel declaration; if unset, defaults to the setting in the existing 'crawler_image_pull_policy' field. Fixes #2522. Co-authored-by: Tessa Walsh <[email protected]>
1 parent df8c80f commit 62e47a8

File tree

11 files changed

+49
-3
lines changed

11 files changed

+49
-3
lines changed

backend/btrixcloud/crawlconfigs.py

+18
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class CrawlConfigOps:
8585

8686
crawler_channels: CrawlerChannels
8787
crawler_images_map: dict[str, str]
88+
crawler_image_pull_policy_map: dict[str, str]
8889

8990
def __init__(
9091
self,
@@ -108,6 +109,9 @@ def __init__(
108109
self.coll_ops = cast(CollectionOps, None)
109110

110111
self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
112+
self.default_crawler_image_pull_policy = os.environ.get(
113+
"DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
114+
)
111115

112116
self.router = APIRouter(
113117
prefix="/crawlconfigs",
@@ -118,13 +122,18 @@ def __init__(
118122
self._file_rx = re.compile("\\W+")
119123

120124
self.crawler_images_map = {}
125+
self.crawler_image_pull_policy_map = {}
121126
channels = []
122127
with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
123128
crawler_list = json.loads(fh.read())
124129
for channel_data in crawler_list:
125130
channel = CrawlerChannel(**channel_data)
126131
channels.append(channel)
127132
self.crawler_images_map[channel.id] = channel.image
133+
if channel.imagePullPolicy:
134+
self.crawler_image_pull_policy_map[channel.id] = (
135+
channel.imagePullPolicy
136+
)
128137

129138
self.crawler_channels = CrawlerChannels(channels=channels)
130139

@@ -960,6 +969,15 @@ def get_channel_crawler_image(
960969
"""Get crawler image name by id"""
961970
return self.crawler_images_map.get(crawler_channel or "")
962971

972+
def get_channel_crawler_image_pull_policy(
973+
self, crawler_channel: Optional[str]
974+
) -> str:
975+
"""Get crawler image name by id"""
976+
return (
977+
self.crawler_image_pull_policy_map.get(crawler_channel or "")
978+
or self.default_crawler_image_pull_policy
979+
)
980+
963981
def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
964982
"""Load CrawlerProxy mapping from config"""
965983
proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]

backend/btrixcloud/crawlmanager.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ async def run_profile_browser(
3333
url: str,
3434
storage: StorageRef,
3535
crawler_image: str,
36+
image_pull_policy: str,
3637
baseprofile: str = "",
3738
profile_filename: str = "",
3839
proxy_id: str = "",
@@ -57,6 +58,7 @@ async def run_profile_browser(
5758
"vnc_password": secrets.token_hex(16),
5859
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
5960
"crawler_image": crawler_image,
61+
"image_pull_policy": image_pull_policy,
6062
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
6163
}
6264

backend/btrixcloud/models.py

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):
603603

604604
id: str
605605
image: str
606+
imagePullPolicy: Optional[str] = None
606607

607608

608609
# ============================================================================

backend/btrixcloud/operator/crawls.py

+5
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ async def sync_crawls(self, data: MCSyncData):
279279
)
280280

281281
params["crawler_image"] = status.crawlerImage
282+
pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
283+
crawl.crawler_channel
284+
)
285+
if pull_policy:
286+
params["crawler_image_pull_policy"] = pull_policy
282287

283288
if crawl.proxy_id and not crawl.is_qa:
284289
proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)

backend/btrixcloud/operator/profiles.py

+3
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ async def sync_profile_browsers(self, data: MCSyncData):
4545
params["storage_secret"] = storage_secret
4646
params["profile_filename"] = spec.get("profileFilename", "")
4747
params["crawler_image"] = spec["crawlerImage"]
48+
pull_policy = spec.get("imagePullPolicy")
49+
if pull_policy:
50+
params["crawler_image_pull_policy"] = pull_policy
4851

4952
proxy_id = spec.get("proxyId")
5053
if proxy_id:

backend/btrixcloud/profiles.py

+5
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ async def create_new_browser(
110110
if not crawler_image:
111111
raise HTTPException(status_code=404, detail="crawler_not_found")
112112

113+
image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
114+
profile_launch.crawlerChannel
115+
)
116+
113117
# use either specified proxyId or if none, use proxyId from existing profile
114118
proxy_id = profile_launch.proxyId or prev_proxy_id
115119

@@ -122,6 +126,7 @@ async def create_new_browser(
122126
url=str(profile_launch.url),
123127
storage=org.storage,
124128
crawler_image=crawler_image,
129+
image_pull_policy=image_pull_policy,
125130
baseprofile=prev_profile_id,
126131
profile_filename=prev_profile_path,
127132
proxy_id=proxy_id,

chart/app-templates/profile_job.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ spec:
2323

2424
storageName: "{{ storage_name }}"
2525
crawlerImage: "{{ crawler_image }}"
26+
imagePullPolicy: "{{ image_pull_policy }}"
2627

2728
startUrl: "{{ url }}"
2829
profileFilename: "{{ profile_filename }}"

chart/examples/local-config.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222
# crawler_channels:
2323
# - id: default
2424
# image: "docker.io/webrecorder/browsertrix-crawler:latest"
25+
# imagePullPolicy: Always
2526
#
2627
# # Add, remove, or edit additional crawler release channels for example:
2728
# - id: custom_version
2829
# image: "<DOCKER IMAGE>"
30+
# imagePullPolicy: IfNotPresent # optional
2931

3032
# overrides to use existing images in local Docker, otherwise will pull from repository
3133
# backend_pull_policy: "Never"

chart/templates/configmap.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ data:
3434

3535
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
3636

37+
DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"
38+
3739
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
3840

3941
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

chart/values.yaml

+6-3
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ replica_deletion_delay_days: 0
104104
# API Image
105105
# =========================================
106106
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
107-
backend_pull_policy: "Always"
107+
backend_pull_policy: "IfNotPresent"
108108

109109
backend_password_secret: "PASSWORD!"
110110

@@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95
162162
# Nginx Image
163163
# =========================================
164164
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
165-
frontend_pull_policy: "Always"
165+
frontend_pull_policy: "IfNotPresent"
166166

167167
frontend_cpu: "10m"
168168

@@ -237,12 +237,15 @@ redis_storage: "3Gi"
237237
crawler_channels:
238238
- id: default
239239
image: "docker.io/webrecorder/browsertrix-crawler:latest"
240+
imagePullPolicy: Always
240241

241242
# Add, remove, or edit additional crawler versions below, for example:
242243
# - id: custom_version
243244
# image: "<DOCKER IMAGE>"
245+
# imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)
244246

245-
crawler_pull_policy: "Always"
247+
# default crawler pull policy if not set per channel
248+
crawler_pull_policy: "IfNotPresent"
246249

247250
crawler_namespace: "crawlers"
248251

frontend/docs/docs/deploy/customization.md

+4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
1818
crawler_channels:
1919
- id: default
2020
image: "docker.io/webrecorder/browsertrix-crawler:latest"
21+
imagePullPolicy: Always # optional
2122
```
2223
2324
This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@ crawler_channels:
2829
image: "docker.io/webrecorder/browsertrix-crawler:latest"
2930
- id: testing
3031
image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
32+
imagePullPolicy: IfNotPresent
3133
```
3234

35+
The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.
36+
3337
## Storage
3438

3539
The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.

0 commit comments

Comments
 (0)