Add NFS support for model tests (#857)

jmalone-tt · web-flow · commit 671e9d59378a · 2025-03-28T15:22:49.000-05:00
* Adds NFS support for model tests
Adds a function that loads model inputs from the cache, downloading them from their URL as a fallback
Disables the torch.hub validation (skip_validation=True) that checks whether we reference a forked branch -
this would often cause rate limit errors

* Fixed caching so it will work on machines without NFS
diff --git a/.github/workflows/before_merge.yaml b/.github/workflows/before_merge.yaml
@@ -45,10 +45,12 @@ jobs:
 
   model-tests:
     needs: lowering-tests
-    runs-on: ["in-service"]
+    runs-on: ["in-service", "nfs"]
     env:      
       pytest_verbosity: 0
       pytest_report_title: "⭐️ Model Tests - Group ${{ matrix.group }}"    
+      TORCH_HOME: /mnt/tt-metal-pytorch-cache/.cache/torch
+      HF_HOME: /mnt/tt-metal-pytorch-cache/.cache/huggingface
     strategy:
       matrix: # Need to find a way to replace this with a generator
         group: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
diff --git a/tests/models/detr/test_detr.py b/tests/models/detr/test_detr.py
@@ -16,7 +16,9 @@ def _load_model(self):
         The model is from https://github.com/facebookresearch/detr
         """
         # Model
-        model = torch.hub.load("facebookresearch/detr:main", "detr_resnet50", pretrained=True).to(torch.bfloat16)
+        model = torch.hub.load("facebookresearch/detr:main", "detr_resnet50", pretrained=True, skip_validation=True).to(
+            torch.bfloat16
+        )
         return model
 
     def _load_inputs(self):
diff --git a/tests/models/hardnet/test_hardnet.py b/tests/models/hardnet/test_hardnet.py
@@ -14,7 +14,7 @@
 
 class ThisTester(ModelTester):
     def _load_model(self):
-        model = torch.hub.load("PingoLH/Pytorch-HarDNet", "hardnet68", pretrained=False)
+        model = torch.hub.load("PingoLH/Pytorch-HarDNet", "hardnet68", pretrained=False, skip_validation=True)
         checkpoint = "https://github.com/PingoLH/Pytorch-HarDNet/raw/refs/heads/master/hardnet68.pth"
         model.load_state_dict(torch.hub.load_state_dict_from_url(checkpoint, progress=False, map_location="cpu"))
         model = model.to(torch.bfloat16)
diff --git a/tests/models/unet/test_unet.py b/tests/models/unet/test_unet.py
@@ -5,11 +5,12 @@
 
 import numpy as np
 from PIL import Image
+from os import path
 from torchvision import transforms
 import requests
 import torch
 import pytest
-from tests.utils import ModelTester
+from tests.utils import ModelTester, get_cached_image_or_reload
 
 
 class ThisTester(ModelTester):
@@ -21,13 +22,17 @@ def _load_model(self):
             out_channels=1,
             init_features=32,
             pretrained=True,
+            skip_validation=True,
         )
         model = model.to(torch.bfloat16)
         return model
 
     def _load_inputs(self):
-        url = "https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png"
-        input_image = Image.open(requests.get(url, stream=True).raw)
+        image_file = get_cached_image_or_reload(
+            relative_cache_path="inputs/TCGA_CS_4944.png",
+            url="https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png",
+        )
+        input_image = Image.open(image_file)
         m, s = np.mean(input_image, axis=(0, 1)), np.std(input_image, axis=(0, 1))
         preprocess = transforms.Compose(
             [
diff --git a/tests/models/unet_brain/test_unet_brain.py b/tests/models/unet_brain/test_unet_brain.py
@@ -7,7 +7,7 @@
 from PIL import Image
 from torchvision import transforms
 import pytest
-from tests.utils import ModelTester
+from tests.utils import ModelTester, get_cached_image_or_reload
 
 
 class ThisTester(ModelTester):
@@ -23,19 +23,16 @@ def _load_model(self):
             out_channels=1,
             init_features=32,
             pretrained=True,
+            skip_validation=True,
         )
         model = model.to(torch.bfloat16)
         return model
 
     def _load_inputs(self):
-        url, filename = (
-            "https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png",
-            "TCGA_CS_4944.png",
+        filename = get_cached_image_or_reload(
+            relative_cache_path="inputs/TCGA_CS_4944.png",
+            url="https://github.com/mateuszbuda/brain-segmentation-pytorch/raw/master/assets/TCGA_CS_4944.png",
         )
-        try:
-            urllib.URLopener().retrieve(url, filename)
-        except:
-            urllib.request.urlretrieve(url, filename)
 
         input_image = Image.open(filename)
         m, s = np.mean(input_image, axis=(0, 1)), np.std(input_image, axis=(0, 1))
diff --git a/tests/models/yolov5/test_yolov5.py b/tests/models/yolov5/test_yolov5.py
@@ -17,7 +17,9 @@
 class ThisTester(ModelTester):
     def _load_model(self):
         # Model
-        model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True, autoshape=False, device="cpu")
+        model = torch.hub.load(
+            "ultralytics/yolov5", "yolov5s", pretrained=True, autoshape=False, device="cpu", skip_validation=True
+        )
         return model.to(torch.bfloat16)
 
     def _load_inputs(self):
diff --git a/tests/utils.py b/tests/utils.py
@@ -4,6 +4,8 @@
 import torch
 import numpy as np
 import re
+import requests
+from os import path, makedirs
 from collections.abc import Mapping, Sequence
 from typing import List, Dict, Tuple
 
@@ -147,6 +149,33 @@ def test_model(self, as_ttnn=False, option=None):
             raise ValueError(f"Current mode is not supported: {self.mode}")
 
 
+def get_absolute_cache_path(path_relative_to_cache):
+    # Resolve a cache-relative path under the NFS cache directory when it exists, else under the user's ~/.cache.
+    nfs_cache_base = "/mnt/tt-metal-pytorch-cache/.cache"
+    if path.exists(nfs_cache_base):
+        return path.join(nfs_cache_base, path_relative_to_cache)
+    else:
+        absolute_cache_base = path.expanduser("~/.cache")
+        return path.join(absolute_cache_base, path_relative_to_cache)
+
+
+def get_cached_image_or_reload(relative_cache_path, url):
+    """Return the local path of a cached input file, downloading it from `url` on a cache miss."""
+    absolute_cache_path = get_absolute_cache_path(relative_cache_path)
+    if path.exists(absolute_cache_path):
+        return absolute_cache_path
+
+    makedirs(path.dirname(absolute_cache_path), exist_ok=True)
+    # Bound the request and close the connection when done; a hung download must not stall the CI job.
+    with requests.get(url, stream=True, timeout=60) as response:
+        # Fail loudly on HTTP errors instead of caching the error page as if it were the image.
+        response.raise_for_status()
+        with open(absolute_cache_path, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+    return absolute_cache_path
+
+
 # Testing utils copied from tt-metal/tests/ttnn/utils_for_testing.py
 def comp_pcc(golden, calculated, pcc=0.99):
     golden = torch.Tensor(golden)
diff --git a/tools/run_torchvision.py b/tools/run_torchvision.py
@@ -17,7 +17,7 @@ def run_model(
     device=None,
 ):
     if model_name == "dinov2_vits14":
-        m = torch.hub.load("facebookresearch/dinov2", model_name)
+        m = torch.hub.load("facebookresearch/dinov2", model_name, skip_validation=True)
     else:
         try:
             m = torchvision.models.get_model(model_name, pretrained=True)
diff --git a/tools/utils.py b/tools/utils.py
@@ -7,9 +7,9 @@
 
 def get_model(model_name):
     if model_name == "dinov2_vits14":
-        m = torch.hub.load("facebookresearch/dinov2", model_name)
+        m = torch.hub.load("facebookresearch/dinov2", model_name, skip_validation=True)
     elif model_name == "detr_resnet50":
-        m = torch.hub.load("facebookresearch/detr:main", "detr_resnet50", pretrained=True)
+        m = torch.hub.load("facebookresearch/detr:main", "detr_resnet50", pretrained=True, skip_validation=True)
     else:
         try:
             m = torchvision.models.get_model(model_name, pretrained=True)