
import modal

+def _download_all_clip_models():
+    """Pre-download all CLIP models at image build time."""
+    from transformers import CLIPModel, CLIPProcessor, CLIPTextModelWithProjection, CLIPTokenizer
+    model_name = "openai/clip-vit-base-patch32"
+    # Full model for video processing
+    CLIPModel.from_pretrained(model_name)
+    CLIPProcessor.from_pretrained(model_name, use_fast=True)
+    # Text-only model for search
+    CLIPTokenizer.from_pretrained(model_name)
+    CLIPTextModelWithProjection.from_pretrained(model_name)
+
+
def get_dev_image() -> modal.Image:
    """
    Create the Modal image for the dev app.
+
+    Pre-downloads all models at build time to eliminate cold start downloads.
    """
    return (
        modal.Image.debian_slim(python_version="3.12")
@@ -29,6 +43,7 @@ def get_dev_image() -> modal.Image:
            "scenedetect",
            "pillow",
        )
+        .run_function(_download_all_clip_models)
        .add_local_python_source(
            "api",
            "database",
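
With .run_function, Modal executes _download_all_clip_models while the image is being built, so the Hugging Face cache it populates is baked into an image layer instead of being fetched on every cold start. As a runtime sanity check (a sketch, not part of this diff), loading with local_files_only=True proves the weights are already inside the image:

    from transformers import CLIPModel

    # Fails fast if the build-time download is missing from the image's cache.
    model = CLIPModel.from_pretrained(
        "openai/clip-vit-base-patch32",
        local_files_only=True,
    )
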
@@ -66,12 +81,22 @@ def get_server_image() -> modal.Image:
        )
    )

+def _download_clip_text_model():
+    """Pre-download CLIP text encoder at image build time."""
+    from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+    model_name = "openai/clip-vit-base-patch32"
+    CLIPTokenizer.from_pretrained(model_name)
+    CLIPTextModelWithProjection.from_pretrained(model_name)
+
+
def get_search_image() -> modal.Image:
    """
    Create the Modal image for the Search app.

    Medium dependencies - includes CLIP text encoder only.
    The text encoder (~150MB) is much lighter than the full CLIP model (~350MB).
+
+    Pre-downloads the model at build time to eliminate cold start downloads.
    """
    return (
        modal.Image.debian_slim(python_version="3.12")
@@ -83,6 +108,7 @@ def get_search_image() -> modal.Image:
            "boto3",
            "numpy",
        )
+        .run_function(_download_clip_text_model)
        .add_local_python_source(
            "database",
            "search",
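
At query time the search app only needs the tokenizer and the projected text encoder, which is why this image pre-downloads the lighter pair. A minimal sketch of embedding a query with these two components (the query string and variable names are illustrative, not taken from the repo):

    import torch
    from transformers import CLIPTextModelWithProjection, CLIPTokenizer

    model_name = "openai/clip-vit-base-patch32"
    tokenizer = CLIPTokenizer.from_pretrained(model_name)
    text_model = CLIPTextModelWithProjection.from_pretrained(model_name)

    inputs = tokenizer(["a dog catching a frisbee"], padding=True, return_tensors="pt")
    with torch.no_grad():
        # text_embeds lives in the shared CLIP space, shape (1, 512) for ViT-B/32.
        text_embeds = text_model(**inputs).text_embeds

The resulting vector can be compared against stored frame embeddings (e.g. by cosine similarity) without ever loading the ~350MB vision tower.
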
@@ -92,12 +118,22 @@ def get_search_image() -> modal.Image:
    )


+def _download_clip_full_model():
+    """Pre-download full CLIP model (vision + text) at image build time."""
+    from transformers import CLIPModel, CLIPProcessor
+    model_name = "openai/clip-vit-base-patch32"
+    CLIPModel.from_pretrained(model_name)
+    CLIPProcessor.from_pretrained(model_name, use_fast=True)
+
+
def get_processing_image() -> modal.Image:
    """
    Create the Modal image for the Processing app.

    Heavy dependencies for video processing pipeline.
    Includes: ffmpeg, opencv, scenedetect, full CLIP model, etc.
+
+    Pre-downloads the model at build time to eliminate cold start downloads.
    """
    return (
        modal.Image.debian_slim(python_version="3.12")
@@ -113,6 +149,7 @@ def get_processing_image() -> modal.Image:
            "pinecone",
            "boto3",
        )
+        .run_function(_download_clip_full_model)
        .add_local_python_source(
            "database",
            "preprocessing",
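
For the processing pipeline, the full model adds the vision tower that the search image deliberately leaves out. A sketch of embedding a single keyframe (the file name is hypothetical; the actual pipeline presumably pulls frames from scenedetect output):

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model_name = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name, use_fast=True)

    frame = Image.open("keyframe.jpg")  # hypothetical extracted frame
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        # get_image_features projects into the same 512-dim space as the text encoder.
        image_embeds = model.get_image_features(**inputs)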