Skip to content

Commit 62e47a8

Browse files
ikreymert and tw4l
authored
support overriding crawler image pull policy per channel (#2523)
- add 'imagePullPolicy' field to each crawler channel declaration; if unset, defaults to the setting in the existing 'crawler_image_pull_policy' field. Fixes #2522. Co-authored-by: Tessa Walsh <[email protected]>
1 parent df8c80f commit 62e47a8

File tree

11 files changed

+49
-3
lines changed

11 files changed

+49
-3
lines changed

backend/btrixcloud/crawlconfigs.py

+18
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class CrawlConfigOps:
8585

8686
crawler_channels: CrawlerChannels
8787
crawler_images_map: dict[str, str]
88+
crawler_image_pull_policy_map: dict[str, str]
8889

8990
def __init__(
9091
self,
@@ -108,6 +109,9 @@ def __init__(
108109
self.coll_ops = cast(CollectionOps, None)
109110

110111
self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
112+
self.default_crawler_image_pull_policy = os.environ.get(
113+
"DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
114+
)
111115

112116
self.router = APIRouter(
113117
prefix="/crawlconfigs",
@@ -118,13 +122,18 @@ def __init__(
118122
self._file_rx = re.compile("\\W+")
119123

120124
self.crawler_images_map = {}
125+
self.crawler_image_pull_policy_map = {}
121126
channels = []
122127
with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
123128
crawler_list = json.loads(fh.read())
124129
for channel_data in crawler_list:
125130
channel = CrawlerChannel(**channel_data)
126131
channels.append(channel)
127132
self.crawler_images_map[channel.id] = channel.image
133+
if channel.imagePullPolicy:
134+
self.crawler_image_pull_policy_map[channel.id] = (
135+
channel.imagePullPolicy
136+
)
128137

129138
self.crawler_channels = CrawlerChannels(channels=channels)
130139

@@ -960,6 +969,15 @@ def get_channel_crawler_image(
960969
"""Get crawler image name by id"""
961970
return self.crawler_images_map.get(crawler_channel or "")
962971

972+
def get_channel_crawler_image_pull_policy(
973+
self, crawler_channel: Optional[str]
974+
) -> str:
975+
"""Get crawler image name by id"""
976+
return (
977+
self.crawler_image_pull_policy_map.get(crawler_channel or "")
978+
or self.default_crawler_image_pull_policy
979+
)
980+
963981
def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
964982
"""Load CrawlerProxy mapping from config"""
965983
proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]

backend/btrixcloud/crawlmanager.py

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ async def run_profile_browser(
3333
url: str,
3434
storage: StorageRef,
3535
crawler_image: str,
36+
image_pull_policy: str,
3637
baseprofile: str = "",
3738
profile_filename: str = "",
3839
proxy_id: str = "",
@@ -57,6 +58,7 @@ async def run_profile_browser(
5758
"vnc_password": secrets.token_hex(16),
5859
"expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
5960
"crawler_image": crawler_image,
61+
"image_pull_policy": image_pull_policy,
6062
"proxy_id": proxy_id or DEFAULT_PROXY_ID,
6163
}
6264

backend/btrixcloud/models.py

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):
603603

604604
id: str
605605
image: str
606+
imagePullPolicy: Optional[str] = None
606607

607608

608609
# ============================================================================

backend/btrixcloud/operator/crawls.py

+5
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ async def sync_crawls(self, data: MCSyncData):
279279
)
280280

281281
params["crawler_image"] = status.crawlerImage
282+
pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
283+
crawl.crawler_channel
284+
)
285+
if pull_policy:
286+
params["crawler_image_pull_policy"] = pull_policy
282287

283288
if crawl.proxy_id and not crawl.is_qa:
284289
proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)

backend/btrixcloud/operator/profiles.py

+3
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ async def sync_profile_browsers(self, data: MCSyncData):
4545
params["storage_secret"] = storage_secret
4646
params["profile_filename"] = spec.get("profileFilename", "")
4747
params["crawler_image"] = spec["crawlerImage"]
48+
pull_policy = spec.get("imagePullPolicy")
49+
if pull_policy:
50+
params["crawler_image_pull_policy"] = pull_policy
4851

4952
proxy_id = spec.get("proxyId")
5053
if proxy_id:

backend/btrixcloud/profiles.py

+5
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ async def create_new_browser(
110110
if not crawler_image:
111111
raise HTTPException(status_code=404, detail="crawler_not_found")
112112

113+
image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
114+
profile_launch.crawlerChannel
115+
)
116+
113117
# use either specified proxyId or if none, use proxyId from existing profile
114118
proxy_id = profile_launch.proxyId or prev_proxy_id
115119

@@ -122,6 +126,7 @@ async def create_new_browser(
122126
url=str(profile_launch.url),
123127
storage=org.storage,
124128
crawler_image=crawler_image,
129+
image_pull_policy=image_pull_policy,
125130
baseprofile=prev_profile_id,
126131
profile_filename=prev_profile_path,
127132
proxy_id=proxy_id,

chart/app-templates/profile_job.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ spec:
2323

2424
storageName: "{{ storage_name }}"
2525
crawlerImage: "{{ crawler_image }}"
26+
imagePullPolicy: "{{ image_pull_policy }}"
2627

2728
startUrl: "{{ url }}"
2829
profileFilename: "{{ profile_filename }}"

chart/examples/local-config.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222
# crawler_channels:
2323
# - id: default
2424
# image: "docker.io/webrecorder/browsertrix-crawler:latest"
25+
# imagePullPolicy: Always
2526
#
2627
# # Add, remove, or edit additional crawler release channels for example:
2728
# - id: custom_version
2829
# image: "<DOCKER IMAGE>"
30+
# imagePullPolicy: IfNotPresent # optional
2931

3032
# overrides to use existing images in local Docker, otherwise will pull from repository
3133
# backend_pull_policy: "Never"

chart/templates/configmap.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ data:
3434

3535
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
3636

37+
DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"
38+
3739
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
3840

3941
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

chart/values.yaml

+6-3
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ replica_deletion_delay_days: 0
104104
# API Image
105105
# =========================================
106106
backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
107-
backend_pull_policy: "Always"
107+
backend_pull_policy: "IfNotPresent"
108108

109109
backend_password_secret: "PASSWORD!"
110110

@@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95
162162
# Nginx Image
163163
# =========================================
164164
frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
165-
frontend_pull_policy: "Always"
165+
frontend_pull_policy: "IfNotPresent"
166166

167167
frontend_cpu: "10m"
168168

@@ -237,12 +237,15 @@ redis_storage: "3Gi"
237237
crawler_channels:
238238
- id: default
239239
image: "docker.io/webrecorder/browsertrix-crawler:latest"
240+
imagePullPolicy: Always
240241

241242
# Add, remove, or edit additional crawler versions below, for example:
242243
# - id: custom_version
243244
# image: "<DOCKER IMAGE>"
245+
# imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)
244246

245-
crawler_pull_policy: "Always"
247+
# default crawler pull policy if not set per channel
248+
crawler_pull_policy: "IfNotPresent"
246249

247250
crawler_namespace: "crawlers"
248251

frontend/docs/docs/deploy/customization.md

+4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
1818
crawler_channels:
1919
- id: default
2020
image: "docker.io/webrecorder/browsertrix-crawler:latest"
21+
imagePullPolicy: Always # optional
2122
```
2223
2324
This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@ crawler_channels:
2829
image: "docker.io/webrecorder/browsertrix-crawler:latest"
2930
- id: testing
3031
image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
32+
imagePullPolicy: IfNotPresent
3133
```
3234

35+
The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.
36+
3337
## Storage
3438

3539
The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.

0 commit comments

Comments
 (0)