Merge pull request #48 from google/ignore-existing-predictions

agentmorris · web-flow · commit 910b5267a29b · 2025-11-13T11:21:00.000-08:00
Added --ignore_existing_predictions flag to run_model.py, which deletes any existing predictions file so that all instances are reprocessed from scratch
diff --git a/notebooks/run_speciesnet_in_jupyter.ipynb b/notebooks/run_speciesnet_in_jupyter.ipynb
@@ -48,6 +48,7 @@
     "from speciesnet import SpeciesNet\n",
     "from speciesnet import SUPPORTED_MODELS\n",
     "\n",
+    "\n",
     "def print_predictions(predictions_dict: dict) -> None:\n",
     "    print(\"Predictions:\")\n",
     "    for prediction in predictions_dict[\"predictions\"]:\n",
diff --git a/notebooks/run_speciesnet_on_colab.ipynb b/notebooks/run_speciesnet_on_colab.ipynb
@@ -72,6 +72,7 @@
     "from speciesnet import SpeciesNet\n",
     "from speciesnet import SUPPORTED_MODELS\n",
     "\n",
+    "\n",
     "def print_predictions(predictions_dict: dict) -> None:\n",
     "    print(\"Predictions:\")\n",
     "    for prediction in predictions_dict[\"predictions\"]:\n",
@@ -100,14 +101,15 @@
     "import shutil\n",
     "\n",
     "# Choose the folder we're going to download to\n",
-    "model_path = '/content/models'\n",
+    "model_path = \"/content/models\"\n",
     "os.makedirs(model_path, exist_ok=True)\n",
     "\n",
     "# Download the model (it will go to a folder like /kaggle/input/...)\n",
-    "download_path = kagglehub.model_download('google/speciesnet/PyTorch/v4.0.1a',\n",
-    "                                          force_download=True)\n",
+    "download_path = kagglehub.model_download(\n",
+    "    \"google/speciesnet/PyTorch/v4.0.1a\", force_download=True\n",
+    ")\n",
     "\n",
-    "print('Model downloaded to temporary folder: {}'.format(download_path))\n",
+    "print(\"Model downloaded to temporary folder: {}\".format(download_path))\n",
     "\n",
     "# List the contents of the downloaded directory to identify the actual files/subdirectories\n",
     "model_files = os.listdir(download_path)\n",
@@ -121,7 +123,7 @@
     "    elif os.path.isdir(source_path):\n",
     "        shutil.copytree(source_path, destination_path, dirs_exist_ok=True)\n",
     "\n",
-    "print('{} files copied to: {}'.format(len(model_files),model_path))"
+    "print(\"{} files copied to: {}\".format(len(model_files), model_path))"
    ]
   },
   {
@@ -141,7 +143,7 @@
    },
    "outputs": [],
    "source": [
-    "os.makedirs('/content/images',exist_ok=True)\n",
+    "os.makedirs(\"/content/images\", exist_ok=True)\n",
     "!wget \"https://github.com/google/cameratrapai/blob/main/test_data/african_elephants.jpg?raw=true\" -O \"/content/images/african_elephants.jpg\"\n",
     "!wget \"https://github.com/google/cameratrapai/blob/main/test_data/american_black_bear.jpg?raw=true\" -O \"/content/images/american_black_bear.jpg\""
    ]
@@ -176,9 +178,10 @@
    "source": [
     "# print the contents of the output json\n",
     "import json\n",
-    "with open('/content/predictions-ensemble.json','r') as f:\n",
-    "  d = json.load(f)\n",
-    "print(str(d))\n"
+    "\n",
+    "with open(\"/content/predictions-ensemble.json\", \"r\") as f:\n",
+    "    d = json.load(f)\n",
+    "print(str(d))"
    ]
   },
   {
diff --git a/speciesnet/scripts/run_model.py b/speciesnet/scripts/run_model.py
@@ -140,6 +140,13 @@
     "unexpected files are supplied. --bypass_prompts bypasses prompts, --nobypass_prompts "
     "(default) does not.",
 )
+_IGNORE_EXISTING_PREDICTIONS = flags.DEFINE_bool(
+    "ignore_existing_predictions",
+    False,
+    "Whether to ignore existing predictions in the output JSON file and reprocess all "
+    "instances. --ignore_existing_predictions bypasses loading partial results, "
+    "--noignore_existing_predictions (default) resumes from existing predictions.",
+)
 
 
 def guess_predictions_source(
@@ -313,37 +320,50 @@ def main(argv: list[str]) -> None:
 
     # Check the compatibility of output predictions with existing partial predictions.
     if _PREDICTIONS_JSON.value:
-        partial_predictions, _ = load_partial_predictions(
-            _PREDICTIONS_JSON.value, instances_dict["instances"]
-        )
-        predictions_source = guess_predictions_source(partial_predictions)
-
-        if _CLASSIFIER_ONLY.value and predictions_source not in [
-            "classifier",
-            "unknown",
-        ]:
-            raise RuntimeError(
-                f"The classifier risks overwriting previous predictions from "
-                f"`{_PREDICTIONS_JSON.value}` that were produced by different "
-                f"components. Make sure to provide a different output location to "
-                f"--{_PREDICTIONS_JSON.name}."
-            )
-
-        if _DETECTOR_ONLY.value and predictions_source not in ["detector", "unknown"]:
-            raise RuntimeError(
-                f"The detector risks overwriting previous predictions from "
-                f"`{_PREDICTIONS_JSON.value}` that were produced by different "
-                f"components. Make sure to provide a different output location to "
-                f"--{_PREDICTIONS_JSON.name}."
-            )
-
-        if _ENSEMBLE_ONLY.value and predictions_source not in ["ensemble", "unknown"]:
-            raise RuntimeError(
-                f"The ensemble risks overwriting previous predictions from "
-                f"`{_PREDICTIONS_JSON.value}` that were produced by different "
-                f"components. Make sure to provide a different output location to "
-                f"--{_PREDICTIONS_JSON.name}."
+        if _IGNORE_EXISTING_PREDICTIONS.value:
+            # When ignoring existing predictions, delete the file to ensure all instances
+            # are reprocessed from scratch.
+            if local_file_exists(_PREDICTIONS_JSON.value):
+                print(f"Deleting existing predictions in `{_PREDICTIONS_JSON.value}`.")
+                Path(_PREDICTIONS_JSON.value).unlink()
+        else:
+            partial_predictions, _ = load_partial_predictions(
+                _PREDICTIONS_JSON.value, instances_dict["instances"]
             )
+            predictions_source = guess_predictions_source(partial_predictions)
+
+            if _CLASSIFIER_ONLY.value and predictions_source not in [
+                "classifier",
+                "unknown",
+            ]:
+                raise RuntimeError(
+                    f"The classifier risks overwriting previous predictions from "
+                    f"`{_PREDICTIONS_JSON.value}` that were produced by different "
+                    f"components. Make sure to provide a different output location to "
+                    f"--{_PREDICTIONS_JSON.name}."
+                )
+
+            if _DETECTOR_ONLY.value and predictions_source not in [
+                "detector",
+                "unknown",
+            ]:
+                raise RuntimeError(
+                    f"The detector risks overwriting previous predictions from "
+                    f"`{_PREDICTIONS_JSON.value}` that were produced by different "
+                    f"components. Make sure to provide a different output location to "
+                    f"--{_PREDICTIONS_JSON.name}."
+                )
+
+            if _ENSEMBLE_ONLY.value and predictions_source not in [
+                "ensemble",
+                "unknown",
+            ]:
+                raise RuntimeError(
+                    f"The ensemble risks overwriting previous predictions from "
+                    f"`{_PREDICTIONS_JSON.value}` that were produced by different "
+                    f"components. Make sure to provide a different output location to "
+                    f"--{_PREDICTIONS_JSON.name}."
+                )
 
     else:
         if not say_yes_to_continue(