Update NOAA Agile Modeling Colab to use new Perch Hoplite DB API.

laurenharrell · copybara-github · commit b6da9ad8e278 · 2026-01-26T16:51:02.000-08:00
Updates the agile_modeling_noaa_demo.ipynb notebook to reflect changes in the Perch Hoplite database interface, including:
-   Using `db.get_all_projects()` instead of `db.get_dataset_names()`.
-   Switching from `db.insert_label` to `db.insert_annotation`.
-   Updating label counting methods from `db.get_class_counts()` to `db.count_each_label()`.
-   Adjusting embedding search to use `db.match_window_ids()`.
-   Correcting the class name `SQLiteUsearchDB` to `SQLiteUSearchDB`.
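-   Minor adjustments to audio loading sample rate handling (the loader rate now defaults to the embedding model's sample rate and is passed to the query display).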

Additionally, the target audio file path was updated to reflect changes to the NOAA Passive Acoustic Data paths on GCP.

PiperOrigin-RevId: 861406885
diff --git a/chirp/projects/whale_demo/agile_modeling_noaa_demo.ipynb b/chirp/projects/whale_demo/agile_modeling_noaa_demo.ipynb
@@ -96,12 +96,15 @@
         "from etils import epath\n",
         "from IPython.display import display\n",
         "import ipywidgets as widgets\n",
+        "from ml_collections import config_dict\n",
         "import numpy as np\n",
+        "\n",
         "from perch_hoplite.agile import colab_utils\n",
         "from perch_hoplite.agile import embed\n",
         "from perch_hoplite.agile import source_info\n",
         "from perch_hoplite.db import brutalism\n",
-        "from perch_hoplite.db import interface"
+        "from perch_hoplite.db import interface\n",
+        "from perch_hoplite.zoo import taxonomy_model_tf"
       ]
     },
     {
@@ -207,7 +210,7 @@
         "# @markdown For this example, we use the name of the large audio file, but you can use a different name here.\n",
         "dataset_name = 'Saipan_A_06_151006_091215'  # @param {type:'string'}\n",
         "# @markdown 2. Input the filepath for the folder that is containing the input audio files.\n",
-        "dataset_base_path = 'gs://noaa-passive-bioacoustic/pifsc/audio/pipan/saipan/pipan_saipan_06/audio'  #@param {type:'string'}\n",
+        "dataset_base_path = 'gs://noaa-passive-bioacoustic/pifsc/audio/pipan_10/saipan/pipan_saipan_06/audio'  #@param {type:'string'}\n",
         "# @markdown 3. Input the file pattern for the audio files within that folder that you want to embed. Some examples for how to input:\n",
         "# @markdown - All files in the base directory of a specific type (not subdirectories): e.g. `*.wav` (or `*.flac` etc) will generate embeddings for all .wav files (or whichever format) in the dataset_base_path\n",
         "# @markdown - All files in one level of subdirectories within the base directory: `*/*.flac` will generate embeddings for all .flac files\n",
@@ -271,12 +274,13 @@
       },
       "outputs": [],
       "source": [
-        "#@title Initialize the hoplite database (DB) { vertical-output: true }\n",
+        "# @title Initialize the hoplite database (DB) {vertical-output: true}\n",
+        "\n",
         "global db\n",
         "db = configs.db_config.load_db()\n",
         "num_embeddings = db.count_embeddings()\n",
         "\n",
-        "print('Initialized DB located at ', configs.db_config.db_config.db_path)\n",
+        "print('Initialized DB located at:', configs.db_config.db_config.db_path)\n",
         "\n",
         "def drop_and_reload_db(_) -> interface.HopliteDBInterface:\n",
         "  db_path = epath.Path(configs.db_config.db_config.db_path)\n",
@@ -286,14 +290,15 @@
         "  print('\\n Deleted previous db at: ', configs.db_config.db_config.db_path)\n",
         "  db = configs.db_config.load_db()\n",
         "\n",
-        "#@markdown If `drop_existing_db` set to True, when the database already exists and contains embeddings,\n",
-        "#@markdown then those existing embeddings will be erased. You will be prompted to confirm you wish to delete those existing\n",
-        "#@markdown embeddings. If you want to keep existing embeddings in the database, then set to False, which will append the new\n",
-        "#@markdown embeddings to the database.\n",
-        "drop_existing_db = False  #@param {type:'boolean'}\n",
+        "# @markdown If `drop_existing_db` is set to True and the database already exists and contains\n",
+        "# @markdown embeddings, then those existing embeddings will be erased. You will be prompted\n",
+        "# @markdown to confirm you wish to delete those existing embeddings. If you want to keep\n",
+        "# @markdown existing embeddings in the database, then set it to False, which will append the new\n",
+        "# @markdown embeddings to the database.\n",
+        "drop_existing_db = False  # @param {type: 'boolean'}\n",
         "\n",
         "if num_embeddings > 0 and drop_existing_db:\n",
-        "  print('Existing DB contains datasets: ', db.get_dataset_names())\n",
+        "  print('Existing DB contains projects: ', db.get_all_projects())\n",
         "  print('num embeddings: ', num_embeddings)\n",
         "  print('\\n\\nClick the button below to confirm you really want to drop the database at ')\n",
         "  print(f'{configs.db_config.db_config.db_path}\\n')\n",
@@ -353,9 +358,16 @@
         "#@title Per dataset statistics { vertical-output: true }\n",
         "#@markdown This tells us how many unique segments are embedded in the database.\n",
         "\n",
-        "for dataset in db.get_dataset_names():\n",
-        "  print(f'\\nDataset \\'{dataset}\\':')\n",
-        "  print('\\tnum embeddings: ', db.get_embeddings_by_source(dataset, source_id=None).shape[0])"
+        "\n",
+        "# @title Per project statistics {vertical-output: true}\n",
+        "\n",
+        "for project in db.get_all_projects():\n",
+        "  window_ids = db.match_window_ids(\n",
+        "      deployments_filter=config_dict.create(eq=dict(project=project))\n",
+        "  )\n",
+        "  print('Project:', project)\n",
+        "  print('>>> num embeddings:', len(window_ids))\n",
+        "  print()"
       ]
     },
     {
@@ -367,14 +379,14 @@
       },
       "outputs": [],
       "source": [
-        "#@title Show example embedding search\n",
-        "#@markdown As an example (and to show that the embedding process worked), this\n",
-        "#@markdown selects a single embedding from the database and outputs the embedding ids of the\n",
-        "#@markdown top-K (k = 128) nearest neighbors in the database.\n",
+        "# @title Show example embedding search\n",
+        "# @markdown As an example (and to show that the embedding process worked), this selects a single\n",
+        "# @markdown embedding from the database and outputs the window ids of the top-k (k = 128)\n",
+        "# @markdown nearest neighbors in the database.\n",
         "\n",
-        "q = db.get_embedding(db.get_one_embedding_id())\n",
+        "q = db.get_embedding(db.match_window_ids(limit=1)[0])\n",
         "%time results, scores = brutalism.brute_search(worker.db, query_embedding=q, search_list_size=128, score_fn=np.dot)\n",
-        "print([int(r.embedding_id) for r in results])"
+        "print([int(r.window_id) for r in results])"
       ]
     },
     {
@@ -411,7 +423,8 @@
         "from perch_hoplite.db import score_functions\n",
         "from perch_hoplite.db  import search_results\n",
         "from perch_hoplite.db import sqlite_usearch_impl\n",
-        "from perch_hoplite.zoo import model_configs"
+        "from perch_hoplite.zoo import model_configs\n",
+        "from perch_hoplite.zoo import taxonomy_model_tf"
       ]
     },
     {
@@ -438,9 +451,9 @@
         "#@markdown but note that the model sample rates will be different from this rate.\n",
         "#@markdown If left blank, then the sample rate will be input from the model's\n",
         "#@markdown sample rate.\n",
-        "audio_loader_sample_rate_hz = 10_000  #@param {type:'number'}\n",
+        "audio_loader_sample_rate_hz = None  #@param {type:'number'}\n",
         "\n",
-        "db = sqlite_usearch_impl.SQLiteUsearchDB.create(db_path)\n",
+        "db = sqlite_usearch_impl.SQLiteUSearchDB.create(db_path)\n",
         "db_model_config = db.get_metadata('model_config')\n",
         "embed_config = db.get_metadata('audio_sources')\n",
         "model_class = model_configs.get_model_class(db_model_config.model_key)\n",
@@ -449,7 +462,6 @@
         "\n",
         "if audio_loader_sample_rate_hz == None:\n",
         "  audio_loader_sample_rate_hz = embedding_model.sample_rate\n",
-        "\n",
         "if hasattr(embedding_model, 'window_size_s'):\n",
         "  window_size_s = embedding_model.window_size_s\n",
         "else:\n",
@@ -479,9 +491,11 @@
         "query_uri = 'gs://bioacoustics-www1/multispecies_blog_media/Be_example3.wav'  #@param {type:'string'}\n",
         "query_label = 'Be_biotwang'  #@param {type:'string'}\n",
         "\n",
-        "\n",
         "query = embedding_display.QueryDisplay(\n",
-        "    uri=query_uri, offset_s=0.0, window_size_s=5.0)\n",
+        "    uri=query_uri,\n",
+        "    offset_s=0.0,\n",
+        "    window_size_s=5.0,\n",
+        "    sample_rate_hz=audio_loader_sample_rate_hz)\n",
         "_ = query.display_interactive()"
       ]
     },
@@ -567,33 +581,38 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "G3sIkOqlXzKB"
+        "id": "_Z1zFDksuC05"
       },
       "outputs": [],
       "source": [
         "#@title Save data labels. { vertical-output: true }\n",
         "#@markdown Counts new labels added to the database.\n",
+        "print(\"Annotations before saving new labels:\", len(db.get_all_annotations()))\n",
+        "\n",
+        "for ann in display_results.harvest_labels(annotator_id):\n",
+        "  db.insert_annotation(\n",
+        "      recording_id=ann.recording_id,\n",
+        "      offsets=ann.offsets,\n",
+        "      label=ann.label,\n",
+        "      label_type=ann.label_type,\n",
+        "      provenance=ann.provenance,\n",
+        "      skip_duplicates=True,\n",
+        "  )\n",
         "\n",
-        "prev_lbls, new_lbls = 0, 0\n",
-        "for lbl in display_results.harvest_labels(annotator_id):\n",
-        "  check = db.insert_label(lbl, skip_duplicates=True)\n",
-        "  new_lbls += check\n",
-        "  prev_lbls += (1 - check)\n",
-        "print('\\nNew labels added: ', new_lbls)\n",
-        "print('\\nLabeled query results that already existed: ', prev_lbls)"
+        "print(\"Annotations after saving new labels:\", len(db.get_all_annotations()))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "ouMfqh0KnZS4"
+        "id": "1TF6_7DouC05"
       },
       "outputs": [],
       "source": [
         "#@title Check how many labels of each class exist in the data\n",
-        "print('\\nTotal positive labels per class: ', db.get_class_counts())\n",
-        "print('\\nTotal negative labels per class: ', db.get_class_counts(label_type = interface.LabelType.NEGATIVE))"
+        "print('\\nTotal positive labels per class: ', db.count_each_label(label_type = interface.LabelType.POSITIVE))\n",
+        "print('\\nTotal negative labels per class: ', db.count_each_label(label_type = interface.LabelType.NEGATIVE))"
       ]
     },
     {
@@ -740,33 +759,40 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "IMXI3vdfmX48"
+        "id": "AluZWMMmwE5K"
       },
       "outputs": [],
       "source": [
         "#@title Save data labels. { vertical-output: true }\n",
         "#@markdown This will save the labels to the database, attached to the embedded examples.\n",
         "\n",
-        "prev_lbls, new_lbls = 0, 0\n",
-        "for lbl in display_results.harvest_labels(annotator_id):\n",
-        "  check = db.insert_label(lbl, skip_duplicates=True)\n",
-        "  new_lbls += check\n",
-        "  prev_lbls += (1 - check)\n",
-        "print('\\nNew labels added: ', new_lbls)\n",
-        "print('\\nQuery examples that already existed: ', prev_lbls)"
+        "\n",
+        "print(\"Annotations before saving new labels:\", len(db.get_all_annotations()))\n",
+        "\n",
+        "for ann in display_results.harvest_labels(annotator_id):\n",
+        "  db.insert_annotation(\n",
+        "      recording_id=ann.recording_id,\n",
+        "      offsets=ann.offsets,\n",
+        "      label=ann.label,\n",
+        "      label_type=ann.label_type,\n",
+        "      provenance=ann.provenance,\n",
+        "      skip_duplicates=True,\n",
+        "  )\n",
+        "\n",
+        "print(\"Annotations after saving new labels:\", len(db.get_all_annotations()))"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "N6jOL17UbgMo"
+        "id": "iHKlxrpgwE5K"
       },
       "outputs": [],
       "source": [
         "#@title Check how many labels of each class exist in the data\n",
-        "print('\\nTotal positive labels per class: ', db.get_class_counts())\n",
-        "print('\\nTotal negative labels per class: ', db.get_class_counts(label_type = interface.LabelType.NEGATIVE))"
+        "print('\\nTotal positive labels per class: ', db.count_each_label(label_type = interface.LabelType.POSITIVE))\n",
+        "print('\\nTotal negative labels per class: ', db.count_each_label(label_type = interface.LabelType.NEGATIVE))"
       ]
     },
     {
@@ -884,12 +910,7 @@
       },
       "name": "agile_modeling_noaa_demo.ipynb",
       "private_outputs": true,
-      "provenance": [
-        {
-          "file_id": "1ePT3-fDB3kA3_T7trthFtu8xTJQWQBoQ",
-          "timestamp": 1723499538314
-        }
-      ],
+      "provenance": [],
       "toc_visible": true
     },
     "kernelspec": {