From 75062fd48c89375969dfe8a92099ef0a46ccdfc5 Mon Sep 17 00:00:00 2001
From: solanyn <14799876+solanyn@users.noreply.github.com>
Date: Wed, 2 Apr 2025 03:15:57 +1100
Subject: [PATCH 1/6] Add question-answer example

Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com>
---
 .../fine-tune-distilbert.ipynb                | 592 ++++++++++++++++++
 1 file changed, 592 insertions(+)
 create mode 100644 examples/pytorch/question-answering/fine-tune-distilbert.ipynb

diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
new file mode 100644
index 0000000000..94f5f4e954
--- /dev/null
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -0,0 +1,592 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b0ee5aff-46d6-47c5-8306-901cfd9206d9",
+   "metadata": {},
+   "source": [
+    "# Fine-tuning DistilBERT for question answering\n",
+    "\n",
+    "This guide describes how to fine-tune DistilBERT on the Stanford Question Answering Dataset (SQuAD) for question answering using Kubeflow Trainer.\n",
+    "\n",
+    "This guide is adapted from the Hugging Face question answering task recipe: https://huggingface.co/docs/transformers/en/tasks/question_answering\n",
+    "\n",
+    "Pretrained DistilBERT: https://huggingface.co/docs/transformers/en/model_doc/distilbert\n",
+    "\n",
+    "SQuAD dataset: https://huggingface.co/datasets/rajpurkar/squad"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c31bc8f2",
+   "metadata": {},
+   "source": [
+    "# Install the Kubeflow SDK and dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "10606685",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n",
+      "  Cloning https://github.com/kubeflow/trainer.git (to revision master) to /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n",
+      "  Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/trainer.git /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n",
+      "  Resolved https://github.com/kubeflow/trainer.git to commit 3781eda0e675c655d03bc4cb84cce4362f601e44\n",
+      "  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: kubernetes>=27.2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (32.0.1)\n",
+      "Requirement already satisfied: pydantic>=2.10.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (2.11.0)\n",
+      "Requirement already satisfied: certifi>=14.05.14 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2025.1.31)\n",
+      "Requirement already satisfied: six>=1.9.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.17.0)\n",
+      "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.9.0.post0)\n",
+      "Requirement already satisfied: pyyaml>=5.4.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (6.0.2)\n",
+      "Requirement already satisfied: google-auth>=1.0.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.38.0)\n",
+      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.8.0)\n",
+      "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.32.3)\n",
+      "Requirement already satisfied: requests-oauthlib in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.0.0)\n",
+      "Requirement already satisfied: oauthlib>=3.2.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (3.2.2)\n",
+      "Requirement already satisfied: urllib3>=1.24.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.3.0)\n",
+      "Requirement already satisfied: durationpy>=0.7 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (0.9)\n",
+      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.7.0)\n",
+      "Requirement already satisfied: pydantic-core==2.33.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (2.33.0)\n",
+      "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (4.13.0)\n",
+      "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.4.0)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (5.5.2)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.4.2)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (4.9)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.10)\n",
+      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.6.1)\n",
+      "Requirement already satisfied: cloudpathlib[gs] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (0.21.0)\n",
+      "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (4.50.3)\n",
+      "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from cloudpathlib[gs]) (3.1.0)\n",
+      "Requirement already satisfied: filelock in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n",
+      "Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.29.3)\n",
+      "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.4)\n",
+      "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (24.2)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n",
+      "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n",
+      "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n",
+      "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n",
+      "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n",
+      "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.6.0)\n",
+      "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n",
+      "Requirement already satisfied: psutil in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (2024.12.0)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (4.13.0)\n",
+      "Requirement already satisfied: networkx in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n",
+      "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n",
+      "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (78.1.0)\n",
+      "Requirement already satisfied: sympy==1.13.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.13.1)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=2.0->transformers[torch]) (1.3.0)\n",
+      "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.38.0)\n",
+      "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.24.2)\n",
+      "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.4.3)\n",
+      "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.7.2)\n",
+      "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (1.7.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.3.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.1.31)\n",
+      "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.69.2)\n",
+      "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (6.30.2)\n",
+      "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.26.1)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (5.5.2)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.4.2)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (4.9)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n",
+      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.6.1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n",
+    "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2a15b91f",
+   "metadata": {},
+   "source": [
+    "# Define the HuggingFace training script\n",
+    "\n",
+    "We need to wrap our training script into a function to create the Kubeflow TrainJob."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "24e7f396-32ce-4d23-b76f-9684de470471",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_distilbert(args):\n",
+    "    \"\"\"Fine-tune DistilBERT on a SQuAD subset; uploads args['MODEL_NAME'] to the GCS bucket args['BUCKET'].\"\"\"\n",
+    "    import os\n",
+    "\n",
+    "    from cloudpathlib import CloudPath\n",
+    "    from datasets import load_dataset\n",
+    "    import torch\n",
+    "    import torch.distributed as dist\n",
+    "    from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer\n",
+    "\n",
+    "    # Initialize distributed environment\n",
+    "    backend = \"nccl\" if torch.cuda.is_available() else \"gloo\"\n",
+    "    dist.init_process_group(backend=backend)\n",
+    "\n",
+    "    local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n",
+    "    print(\n",
+    "        \"Distributed Training with WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}.\".format(\n",
+    "            dist.get_world_size(),\n",
+    "            dist.get_rank(),\n",
+    "            local_rank,\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "    # Download the dataset and tokenizer\n",
+    "    squad = load_dataset(\"squad\", split=\"train[:5000]\")\n",
+    "    squad = squad.train_test_split(test_size=0.2, seed=42)  # Fixed seed: every rank must build the same split.\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilbert-base-uncased\")\n",
+    "\n",
+    "    # Define the preprocessing function\n",
+    "    def preprocess_function(examples):\n",
+    "        questions = [q.strip() for q in examples[\"question\"]]\n",
+    "        inputs = tokenizer(\n",
+    "            questions,\n",
+    "            examples[\"context\"],\n",
+    "            max_length=384,\n",
+    "            truncation=\"only_second\",\n",
+    "            return_offsets_mapping=True,\n",
+    "            padding=\"max_length\",\n",
+    "        )\n",
+    "\n",
+    "        offset_mapping = inputs.pop(\"offset_mapping\")\n",
+    "        answers = examples[\"answers\"]\n",
+    "        start_positions = []\n",
+    "        end_positions = []\n",
+    "\n",
+    "        for i, offset in enumerate(offset_mapping):\n",
+    "            answer = answers[i]\n",
+    "            start_char = answer[\"answer_start\"][0]\n",
+    "            end_char = answer[\"answer_start\"][0] + len(answer[\"text\"][0])\n",
+    "            sequence_ids = inputs.sequence_ids(i)\n",
+    "\n",
+    "            # Find the start and end of the context\n",
+    "            idx = 0\n",
+    "            while sequence_ids[idx] != 1:\n",
+    "                idx += 1\n",
+    "            context_start = idx\n",
+    "            while sequence_ids[idx] == 1:\n",
+    "                idx += 1\n",
+    "            context_end = idx - 1\n",
+    "\n",
+    "            # If the answer is not fully inside the context, label it (0, 0)\n",
+    "            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:\n",
+    "                start_positions.append(0)\n",
+    "                end_positions.append(0)\n",
+    "            else:\n",
+    "                # Otherwise it's the start and end token positions\n",
+    "                idx = context_start\n",
+    "                while idx <= context_end and offset[idx][0] <= start_char:\n",
+    "                    idx += 1\n",
+    "                start_positions.append(idx - 1)\n",
+    "\n",
+    "                idx = context_end\n",
+    "                while idx >= context_start and offset[idx][1] >= end_char:\n",
+    "                    idx -= 1\n",
+    "                end_positions.append(idx + 1)\n",
+    "\n",
+    "        inputs[\"start_positions\"] = start_positions\n",
+    "        inputs[\"end_positions\"] = end_positions\n",
+    "        return inputs\n",
+    "\n",
+    "    # Apply the preprocessing function to the dataset\n",
+    "    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad[\"train\"].column_names)\n",
+    "\n",
+    "    # Create a batch of examples using DefaultDataCollator\n",
+    "    data_collator = DefaultDataCollator()\n",
+    "\n",
+    "    # Load the model\n",
+    "    model = AutoModelForQuestionAnswering.from_pretrained(\"distilbert/distilbert-base-uncased\")\n",
+    "\n",
+    "    # Define training hyperparameters\n",
+    "    training_args = TrainingArguments(\n",
+    "        output_dir=args[\"MODEL_NAME\"],\n",
+    "        eval_strategy=\"epoch\",\n",
+    "        learning_rate=2e-5,\n",
+    "        per_device_train_batch_size=16,\n",
+    "        per_device_eval_batch_size=16,\n",
+    "        num_train_epochs=3,\n",
+    "        weight_decay=0.01,\n",
+    "        push_to_hub=False,\n",
+    "    )\n",
+    "\n",
+    "    # Prepare trainer with configuration\n",
+    "    trainer = Trainer(\n",
+    "        model=model,\n",
+    "        args=training_args,\n",
+    "        train_dataset=tokenized_squad[\"train\"],\n",
+    "        eval_dataset=tokenized_squad[\"test\"],\n",
+    "        processing_class=tokenizer,\n",
+    "        data_collator=data_collator,\n",
+    "    )\n",
+    "\n",
+    "    trainer.train()\n",
+    "\n",
+    "    if dist.get_rank() == 0:  # Trainer saves checkpoints on the main process only by default, so only rank 0 uploads.\n",
+    "        CloudPath(f'gs://{args[\"BUCKET\"]}/{args[\"MODEL_NAME\"]}').upload_from(args[\"MODEL_NAME\"])\n",
+    "    dist.destroy_process_group()  # Clean shutdown; avoids the ProcessGroupNCCL warning on exit."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bf5ab9ba-6054-40d6-839d-f84ff0fba8fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n",
+      "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
+      "\n",
+      "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n",
+      "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
+      "\n",
+      "Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
+      "Entrypoint: ['torchrun']\n",
+      "\n",
+      "Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
+      "Entrypoint: ['torchrun']\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from kubeflow.trainer import TrainerClient, CustomTrainer\n",
+    "\n",
+    "for r in TrainerClient().list_runtimes():\n",
+    "    print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n",
+    "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n",
+    "    print()\n",
+    "\n",
+    "    if r.name == \"torch-distributed\":\n",
+    "        torch_runtime = r"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e3fd0c5f-f359-4c6c-9f0e-2e91904579b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BUCKET = \"tmp-kftrainer\"\n",
+    "MODEL_NAME = \"qa-distilbert\"\n",
+    "args = {\n",
+    "    \"BUCKET\": BUCKET,\n",
+    "    \"MODEL_NAME\": MODEL_NAME,\n",
+    "}\n",
+    "\n",
+    "job_id = TrainerClient().train(\n",
+    "    trainer=CustomTrainer(\n",
+    "        func=train_distilbert,\n",
+    "        func_args=args,\n",
+    "        num_nodes=2,\n",
+    "        packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[gs]\"],\n",
+    "        resources_per_node={\n",
+    "            \"cpu\": \"3\",\n",
+    "            \"memory\": \"8Gi\",\n",
+    "            # Remove this line to run the TrainJob on CPU-only nodes.\n",
+    "            \"nvidia.com/gpu\": 1,\n",
+    "        },\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "aabade1b-2c0b-492b-be97-03b4f0e037f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ac43d22fc37e'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Train API generates a random TrainJob id.\n",
+    "job_id"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9294ea5",
+   "metadata": {},
+   "source": [
+    "# Check the TrainJob details\n",
+    "\n",
+    "Use `list_jobs()` and `get_job()` APIs to get details about the created TrainJob and its steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fc5de9e8-f798-4cfd-bc6e-f17774cbd235",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TrainJob: ac43d22fc37e, Status: Created, Created at: 2025-04-01 15:34:40+00:00\n"
+     ]
+    }
+   ],
+   "source": [
+    "for job in TrainerClient().list_jobs():\n",
+    "    print(f\"TrainJob: {job.name}, Status: {job.status}, Created at: {job.creation_timestamp}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "a3eec801",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: node-0, Status: Running, Devices: gpu x 1\n",
+      "Step: node-1, Status: Running, Devices: gpu x 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Each step corresponds to one of the TrainJob's training nodes.\n",
+    "for c in TrainerClient().get_job(name=job_id).steps:\n",
+    "    print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "30f812d7",
+   "metadata": {},
+   "source": [
+    "# Show the TrainJob logs\n",
+    "\n",
+    "Use `get_job_logs()` API to retrieve the TrainJob logs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n",
+      "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n",
+      "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 542991.51 examples/s]\n",
+      "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 505234.17 examples/s]\n",
+      "Map: 100%|██████████| 4000/4000 [00:02<00:00, 1966.78 examples/s]\n",
+      "Map: 100%|██████████| 1000/1000 [00:00<00:00, 1873.66 examples/s]\n",
+      "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n",
+      "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n",
+      "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "  0%|          | 0/375 [00:00<?, ?it/s][rank0]:[W401 15:40:00.952253573 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
+      " 33%|███▎      | 125/375 [01:01<01:45,  2.37it/s]\n",
+      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
+      " 12%|█▎        | 4/32 [00:00<00:01, 24.25it/s]\u001b[A\n",
+      " 22%|██▏       | 7/32 [00:00<00:01, 16.96it/s]\u001b[A\n",
+      " 28%|██▊       | 9/32 [00:00<00:01, 15.60it/s]\u001b[A\n",
+      " 34%|███▍      | 11/32 [00:00<00:01, 14.69it/s]\u001b[A\n",
+      " 41%|████      | 13/32 [00:00<00:01, 14.13it/s]\u001b[A\n",
+      " 47%|████▋     | 15/32 [00:01<00:01, 13.58it/s]\u001b[A\n",
+      " 53%|█████▎    | 17/32 [00:01<00:01, 13.27it/s]\u001b[A\n",
+      " 59%|█████▉    | 19/32 [00:01<00:00, 13.34it/s]\u001b[A\n",
+      " 66%|██████▌   | 21/32 [00:01<00:00, 13.32it/s]\u001b[A\n",
+      " 72%|███████▏  | 23/32 [00:01<00:00, 13.26it/s]\u001b[A\n",
+      " 78%|███████▊  | 25/32 [00:01<00:00, 13.21it/s]\u001b[A\n",
+      " 84%|████████▍ | 27/32 [00:01<00:00, 13.11it/s]\u001b[A\n",
+      " 91%|█████████ | 29/32 [00:02<00:00, 13.03it/s]\u001b[A\n",
+      "                                                 A\n",
+      "[node-0]: {'eval_loss': 2.8629233837127686, 'eval_runtime': 2.4589, 'eval_samples_per_second': 406.689, 'eval_steps_per_second': 13.014, 'epoch': 1.0}\n",
+      " 33%|███▎      | 125/375 [01:03<01:45,  2.37it/s]\n",
+      "100%|██████████| 32/32 [00:02<00:00, 12.98it/s]\u001b[A\n",
+      " 67%|██████▋   | 250/375 [02:19<00:47,  2.63it/s]A\n",
+      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
+      "  9%|▉         | 3/32 [00:00<00:01, 20.59it/s]\u001b[A\n",
+      " 19%|█▉        | 6/32 [00:00<00:01, 15.36it/s]\u001b[A\n",
+      " 25%|██▌       | 8/32 [00:00<00:01, 14.41it/s]\u001b[A\n",
+      " 31%|███▏      | 10/32 [00:00<00:01, 13.83it/s]\u001b[A\n",
+      " 38%|███▊      | 12/32 [00:00<00:01, 13.48it/s]\u001b[A\n",
+      " 44%|████▍     | 14/32 [00:01<00:01, 13.16it/s]\u001b[A\n",
+      " 50%|█████     | 16/32 [00:01<00:01, 13.15it/s]\u001b[A\n",
+      " 56%|█████▋    | 18/32 [00:01<00:01, 13.07it/s]\u001b[A\n",
+      " 62%|██████▎   | 20/32 [00:01<00:00, 12.84it/s]\u001b[A\n",
+      " 69%|██████▉   | 22/32 [00:01<00:00, 12.86it/s]\u001b[A\n",
+      " 75%|███████▌  | 24/32 [00:01<00:00, 12.86it/s]\u001b[A\n",
+      " 81%|████████▏ | 26/32 [00:01<00:00, 12.83it/s]\u001b[A\n",
+      " 88%|████████▊ | 28/32 [00:02<00:00, 12.85it/s]\u001b[A\n",
+      " 94%|█████████▍| 30/32 [00:02<00:00, 12.73it/s]\u001b[A\n",
+      "[node-0]: {'eval_loss': 2.1243765354156494, 'eval_runtime': 2.5006, 'eval_samples_per_second': 399.905, 'eval_steps_per_second': 12.797, 'epoch': 2.0}\n",
+      "                                                 A\n",
+      " 67%|██████▋   | 250/375 [02:21<00:47,  2.63it/s]\n",
+      "100%|██████████| 32/32 [00:02<00:00, 13.06it/s]\u001b[A\n",
+      "100%|██████████| 375/375 [03:26<00:00,  1.70it/s]A\n",
+      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
+      "  9%|▉         | 3/32 [00:00<00:01, 20.97it/s]\u001b[A\n",
+      " 19%|█▉        | 6/32 [00:00<00:01, 15.29it/s]\u001b[A\n",
+      " 25%|██▌       | 8/32 [00:00<00:01, 14.21it/s]\u001b[A\n",
+      " 31%|███▏      | 10/32 [00:00<00:01, 13.74it/s]\u001b[A\n",
+      " 38%|███▊      | 12/32 [00:00<00:01, 13.57it/s]\u001b[A\n",
+      " 44%|████▍     | 14/32 [00:01<00:01, 13.09it/s]\u001b[A\n",
+      " 50%|█████     | 16/32 [00:01<00:01, 12.98it/s]\u001b[A\n",
+      " 56%|█████▋    | 18/32 [00:01<00:01, 12.87it/s]\u001b[A\n",
+      " 62%|██████▎   | 20/32 [00:01<00:00, 12.77it/s]\u001b[A\n",
+      " 69%|██████▉   | 22/32 [00:01<00:00, 12.75it/s]\u001b[A\n",
+      " 75%|███████▌  | 24/32 [00:01<00:00, 12.72it/s]\u001b[A\n",
+      " 81%|████████▏ | 26/32 [00:01<00:00, 12.58it/s]\u001b[A\n",
+      " 88%|████████▊ | 28/32 [00:02<00:00, 12.54it/s]\u001b[A\n",
+      " 94%|█████████▍| 30/32 [00:02<00:00, 12.54it/s]\u001b[A\n",
+      "[node-0]: {'eval_loss': 1.9835596084594727, 'eval_runtime': 2.5234, 'eval_samples_per_second': 396.288, 'eval_steps_per_second': 12.681, 'epoch': 3.0}\n",
+      "                                                 A\n",
+      "100%|██████████| 375/375 [03:30<00:00,  1.70it/s]\n",
+      "100%|██████████| 32/32 [00:02<00:00, 12.76it/s]\u001b[A\n",
+      "[node-0]: {'train_runtime': 210.6231, 'train_samples_per_second': 56.974, 'train_steps_per_second': 1.78, 'train_loss': 2.8177399088541666, 'epoch': 3.0}\n",
+      "100%|██████████| 375/375 [03:30<00:00,  1.78it/s]A\n",
+      "[node-0]: [rank0]:[W401 15:44:47.867488690 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())\n"
+     ]
+    }
+   ],
+   "source": [
+    "_ = TrainerClient().get_job_logs(name=job_id, follow=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66e6f04e-6e0c-4402-86d1-c0ccbe3f8602",
+   "metadata": {},
+   "source": [
+    "# Inference\n",
+    "\n",
+    "Download the model and run inference on some examples."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "1af042a5-abec-456c-a74b-fa78efb8000e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cloudpathlib import CloudPath\n",
+    "from pathlib import Path\n",
+    "\n",
+    "_ = CloudPath(f'gs://tmp-kftrainer/{MODEL_NAME}').download_to(MODEL_NAME)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "cd32e5eb-75e4-4f87-b310-5e1f48b84a18",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use mps:0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 0.13226985931396484,\n",
+       " 'start': 10,\n",
+       " 'end': 95,\n",
+       " 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "question = \"How many programming languages does BLOOM support?\"\n",
+    "context = \"BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages.\"\n",
+    "\n",
+    "question_answerer = pipeline(\"question-answering\", model=f\"./{MODEL_NAME}/checkpoint-375\")\n",
+    "question_answerer(question=question, context=context)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "32971b7b",
+   "metadata": {},
+   "source": [
+    "# Clean up\n",
+    "\n",
+    "To delete the TrainJob you can use the `delete_job()` API and pass the generated `job_id`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "16608b6d-87bf-414d-a174-331f174c6add",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = TrainerClient().delete_job(job_id)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From a7f044b8235e069b153b13515c1c3a1392e6f329 Mon Sep 17 00:00:00 2001
From: solanyn <14799876+solanyn@users.noreply.github.com>
Date: Wed, 2 Apr 2025 15:20:27 +1100
Subject: [PATCH 2/6] chore: remove unused lines, add TODO comment

Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com>
---
 .../pytorch/question-answering/fine-tune-distilbert.ipynb  | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
index 94f5f4e954..1d2cb988bd 100644
--- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -110,6 +110,7 @@
     }
    ],
    "source": [
+    "# TODO: Change the version of SDK when we have the first release of Trainer SDK\n",
     "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n",
     "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\""
    ]
@@ -280,10 +281,7 @@
     "for r in TrainerClient().list_runtimes():\n",
     "    print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n",
     "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n",
-    "    print()\n",
-    "\n",
-    "    if r.name == \"torch-distributed\":\n",
-    "        torch_runtime = r"
+    "\n"
    ]
   },
   {
@@ -309,7 +307,6 @@
     "        resources_per_node={\n",
     "            \"cpu\": \"3\",\n",
     "            \"memory\": \"8Gi\",\n",
-    "            # Uncomment this to distribute the TrainJob using GPU nodes.\n",
     "            \"nvidia.com/gpu\": 1,\n",
     "        },\n",
     "    ),\n",

From ec8d1eac1ac04326ea2cac7ecb9b571cff1ff53d Mon Sep 17 00:00:00 2001
From: Andrew Chen <14799876+solanyn@users.noreply.github.com>
Date: Sat, 26 Apr 2025 08:33:20 +1000
Subject: [PATCH 3/6] chore: update example description

Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Signed-off-by: Andrew Chen <14799876+solanyn@users.noreply.github.com>
---
 examples/pytorch/question-answering/fine-tune-distilbert.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
index 1d2cb988bd..a9af34f1ac 100644
--- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Fine-tuning DistilBERT for question answering\n",
     "\n",
-    "This guide describes fine-tuning DistilBERT with Stanford Question Answering Dataset (SQuAD) for question-answering using Kubeflow Trainer.\n",
+    "This guide describes fine-tuning the DistilBERT model with the Stanford Question Answering Dataset (SQuAD) for question answering using Kubeflow Trainer.\n",
     "\n",
     "This guide is adapted from HuggingFace question answering task recipe page: https://huggingface.co/docs/transformers/en/tasks/question_answering\n",
     "\n",

From 5445bc1f80d1de637ce67645992ff3c9b9ee069f Mon Sep 17 00:00:00 2001
From: solanyn <14799876+solanyn@users.noreply.github.com>
Date: Mon, 28 Apr 2025 20:54:47 +1000
Subject: [PATCH 4/6] chore: update question-answering example

* run train job on CPU
* reduce batch size, dataset size and train epochs
* make upload to bucket optional
* add notebook to e2e-test
* set model name as trainjob argument

Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com>
---
 .github/workflows/test-e2e.yaml               |   1 +
 .../fine-tune-distilbert.ipynb                | 361 ++++++++----------
 2 files changed, 151 insertions(+), 211 deletions(-)

diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml
index fed66a1f8b..145c2a87bb 100644
--- a/.github/workflows/test-e2e.yaml
+++ b/.github/workflows/test-e2e.yaml
@@ -56,6 +56,7 @@ jobs:
         run: |
           mkdir -p artifacts/notebooks
           make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb TIMEOUT=900
+          make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/question-answering/fine-tune-distilbert.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb TIMEOUT=900
 
       # TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks.
       - name: Upload Artifacts to GitHub
diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
index a9af34f1ac..0c1e798d76 100644
--- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -21,7 +21,9 @@
    "id": "c31bc8f2",
    "metadata": {},
    "source": [
-    "# Install the KubeFlow SDK and dependencies"
+    "# Install the Kubeflow SDK and dependencies\n",
+    "\n",
+    "To install the Kubeflow SDK, see: https://www.kubeflow.org/docs/components/trainer/getting-started/"
    ]
   },
   {
@@ -36,83 +38,87 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Collecting git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n",
-      "  Cloning https://github.com/kubeflow/trainer.git (to revision master) to /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n",
-      "  Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/trainer.git /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n",
-      "  Resolved https://github.com/kubeflow/trainer.git to commit 3781eda0e675c655d03bc4cb84cce4362f601e44\n",
-      "  Installing build dependencies ... \u001b[?25ldone\n",
-      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
-      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
-      "\u001b[?25hRequirement already satisfied: kubernetes>=27.2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (32.0.1)\n",
-      "Requirement already satisfied: pydantic>=2.10.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (2.11.0)\n",
-      "Requirement already satisfied: certifi>=14.05.14 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2025.1.31)\n",
-      "Requirement already satisfied: six>=1.9.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.17.0)\n",
-      "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.9.0.post0)\n",
-      "Requirement already satisfied: pyyaml>=5.4.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (6.0.2)\n",
-      "Requirement already satisfied: google-auth>=1.0.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.38.0)\n",
-      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.8.0)\n",
-      "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.32.3)\n",
-      "Requirement already satisfied: requests-oauthlib in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.0.0)\n",
-      "Requirement already satisfied: oauthlib>=3.2.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (3.2.2)\n",
-      "Requirement already satisfied: urllib3>=1.24.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.3.0)\n",
-      "Requirement already satisfied: durationpy>=0.7 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (0.9)\n",
-      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.7.0)\n",
-      "Requirement already satisfied: pydantic-core==2.33.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (2.33.0)\n",
-      "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (4.13.0)\n",
-      "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.4.0)\n",
-      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (5.5.2)\n",
-      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.4.2)\n",
-      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (4.9)\n",
-      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.4.1)\n",
-      "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.10)\n",
-      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.6.1)\n",
-      "Requirement already satisfied: cloudpathlib[gs] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (0.21.0)\n",
-      "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (4.50.3)\n",
-      "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from cloudpathlib[gs]) (3.1.0)\n",
-      "Requirement already satisfied: filelock in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n",
-      "Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.29.3)\n",
-      "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.4)\n",
-      "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (24.2)\n",
-      "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n",
-      "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n",
-      "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n",
-      "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n",
-      "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n",
-      "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.6.0)\n",
-      "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n",
-      "Requirement already satisfied: psutil in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n",
-      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (2024.12.0)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (4.13.0)\n",
-      "Requirement already satisfied: networkx in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n",
-      "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n",
-      "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (78.1.0)\n",
-      "Requirement already satisfied: sympy==1.13.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.13.1)\n",
-      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=2.0->transformers[torch]) (1.3.0)\n",
-      "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.38.0)\n",
-      "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.24.2)\n",
-      "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.4.3)\n",
-      "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.7.2)\n",
-      "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (1.7.1)\n",
-      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n",
-      "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n",
-      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.3.0)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.1.31)\n",
-      "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.69.2)\n",
-      "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (6.30.2)\n",
-      "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.26.1)\n",
-      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (5.5.2)\n",
-      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.4.2)\n",
-      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (4.9)\n",
-      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n",
-      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.6.1)\n"
+      "Requirement already satisfied: cloudpathlib[all] in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (0.21.0)\n",
+      "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (4.51.3)\n",
+      "Requirement already satisfied: filelock in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n",
+      "Requirement already satisfied: huggingface-hub<1.0,>=0.30.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.30.2)\n",
+      "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.5)\n",
+      "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (25.0)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n",
+      "Requirement already satisfied: requests in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n",
+      "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n",
+      "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n",
+      "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n",
+      "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.7.0)\n",
+      "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n",
+      "Requirement already satisfied: psutil in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.30.0->transformers[torch]) (2025.3.2)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.30.0->transformers[torch]) (4.13.2)\n",
+      "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (80.0.0)\n",
+      "Requirement already satisfied: sympy>=1.13.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.14.0)\n",
+      "Requirement already satisfied: networkx in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n",
+      "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n",
+      "Collecting azure-storage-blob>=12 (from cloudpathlib[all])\n",
+      "  Using cached azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)\n",
+      "Collecting azure-storage-file-datalake>=12 (from cloudpathlib[all])\n",
+      "  Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl.metadata (16 kB)\n",
+      "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (3.1.0)\n",
+      "Collecting boto3>=1.34.0 (from cloudpathlib[all])\n",
+      "  Using cached boto3-1.38.3-py3-none-any.whl.metadata (6.6 kB)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.4.26)\n",
+      "Collecting azure-core>=1.30.0 (from azure-storage-blob>=12->cloudpathlib[all])\n",
+      "  Using cached azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)\n",
+      "Collecting cryptography>=2.1.4 (from azure-storage-blob>=12->cloudpathlib[all])\n",
+      "  Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)\n",
+      "Collecting isodate>=0.6.1 (from azure-storage-blob>=12->cloudpathlib[all])\n",
+      "  Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)\n",
+      "Collecting botocore<1.39.0,>=1.38.3 (from boto3>=1.34.0->cloudpathlib[all])\n",
+      "  Using cached botocore-1.38.3-py3-none-any.whl.metadata (5.7 kB)\n",
+      "Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.34.0->cloudpathlib[all])\n",
+      "  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n",
+      "Collecting s3transfer<0.13.0,>=0.12.0 (from boto3>=1.34.0->cloudpathlib[all])\n",
+      "  Using cached s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from sympy>=1.13.3->torch>=2.0->transformers[torch]) (1.3.0)\n",
+      "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.39.0)\n",
+      "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.25.0rc0)\n",
+      "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.4.3)\n",
+      "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.7.2)\n",
+      "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (1.7.1)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n",
+      "Requirement already satisfied: six>=1.11.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-core>=1.30.0->azure-storage-blob>=12->cloudpathlib[all]) (1.17.0)\n",
+      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from botocore<1.39.0,>=1.38.3->boto3>=1.34.0->cloudpathlib[all]) (2.9.0.post0)\n",
+      "Requirement already satisfied: cffi>=1.12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (1.17.1)\n",
+      "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (1.70.0)\n",
+      "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (6.30.2)\n",
+      "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (1.26.1)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (5.5.2)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.4.2)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (4.9.1)\n",
+      "Requirement already satisfied: pycparser in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (2.22)\n",
+      "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.6.1)\n",
+      "Using cached azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)\n",
+      "Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl (263 kB)\n",
+      "Using cached boto3-1.38.3-py3-none-any.whl (139 kB)\n",
+      "Using cached azure_core-1.33.0-py3-none-any.whl (207 kB)\n",
+      "Using cached botocore-1.38.3-py3-none-any.whl (13.5 MB)\n",
+      "Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl (6.7 MB)\n",
+      "Using cached isodate-0.7.2-py3-none-any.whl (22 kB)\n",
+      "Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
+      "Using cached s3transfer-0.12.0-py3-none-any.whl (84 kB)\n",
+      "Installing collected packages: jmespath, isodate, cryptography, botocore, azure-core, s3transfer, azure-storage-blob, boto3, azure-storage-file-datalake\n",
+      "Successfully installed azure-core-1.33.0 azure-storage-blob-12.25.1 azure-storage-file-datalake-12.20.0 boto3-1.38.3 botocore-1.38.3 cryptography-44.0.2 isodate-0.7.2 jmespath-1.0.1 s3transfer-0.12.0\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
      ]
     }
    ],
    "source": [
-    "# TODO: Change the version of SDK when we have the first release of Trainer SDK\n",
-    "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n",
-    "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\""
+    "!pip install \"cloudpathlib[all]\" \"transformers[torch]\""
    ]
   },
   {
@@ -156,11 +162,11 @@
     "    )\n",
     "\n",
     "    # Download the dataset and tokenizer\n",
-    "    squad = load_dataset(\"squad\", split=\"train[:5000]\")    \n",
+    "    squad = load_dataset(\"squad\", split=\"train[:100]\")    \n",
     "\n",
-    "    squad = squad.train_test_split(test_size=0.2)\n",
+    "    squad = squad.train_test_split(test_size=0.2, shuffle=False)\n",
     "    \n",
-    "    tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilbert-base-uncased\")\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(f'distilbert/{args[\"MODEL_NAME\"]}')\n",
     "    \n",
     "    # Define the preprocessing function\n",
     "    def preprocess_function(examples):\n",
@@ -221,16 +227,16 @@
     "    data_collator = DefaultDataCollator()\n",
     "\n",
     "    # Load the model\n",
-    "    model = AutoModelForQuestionAnswering.from_pretrained(\"distilbert/distilbert-base-uncased\")\n",
+    "    model = AutoModelForQuestionAnswering.from_pretrained(f'distilbert/{args[\"MODEL_NAME\"]}')\n",
     "\n",
     "    # Define training hyperparameters\n",
     "    training_args = TrainingArguments(\n",
     "        output_dir=args[\"MODEL_NAME\"],\n",
     "        eval_strategy=\"epoch\",\n",
     "        learning_rate=2e-5,\n",
-    "        per_device_train_batch_size=16,\n",
-    "        per_device_eval_batch_size=16,\n",
-    "        num_train_epochs=3,\n",
+    "        per_device_train_batch_size=1,\n",
+    "        per_device_eval_batch_size=1,\n",
+    "        num_train_epochs=1,\n",
     "        weight_decay=0.01,\n",
     "        push_to_hub=False,\n",
     "    )\n",
@@ -247,7 +253,9 @@
     "    \n",
     "    trainer.train()\n",
     "\n",
-    "    CloudPath(f'gs://{args[\"BUCKET\"]}/{args[\"MODEL_NAME\"]}').upload_from(args[\"MODEL_NAME\"])"
+    "    # Upload the fine-tuned model\n",
+    "    if args.get(\"BUCKET\", None):\n",
+    "        (CloudPath(args[\"BUCKET\"]) / args[\"MODEL_NAME\"]).upload_from(args[\"MODEL_NAME\"])"
    ]
   },
   {
@@ -262,16 +270,12 @@
      "text": [
       "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n",
       "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
-      "\n",
       "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n",
       "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n",
-      "\n",
       "Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
       "Entrypoint: ['torchrun']\n",
-      "\n",
       "Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer\n",
-      "Entrypoint: ['torchrun']\n",
-      "\n"
+      "Entrypoint: ['torchrun']\n"
      ]
     }
    ],
@@ -280,8 +284,7 @@
     "\n",
     "for r in TrainerClient().list_runtimes():\n",
     "    print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n",
-    "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n",
-    "\n"
+    "    print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")"
    ]
   },
   {
@@ -291,8 +294,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "BUCKET = \"tmp-kftrainer\"\n",
-    "MODEL_NAME = \"qa-distilbert\"\n",
+    "# To upload to object storage (S3, GCS or Azure Blob Storage), set the bucket with protocol, e.g., \"s3://my-bucket/folder\"\n",
+    "BUCKET = None\n",
+    "\n",
+    "MODEL_NAME = \"distilbert-base-uncased\"\n",
     "args = {\n",
     "    \"BUCKET\": BUCKET,\n",
     "    \"MODEL_NAME\": MODEL_NAME,\n",
@@ -302,12 +307,13 @@
     "    trainer=CustomTrainer(\n",
     "        func=train_distilbert,\n",
     "        func_args=args,\n",
-    "        num_nodes=2,\n",
-    "        packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[gs]\"],\n",
+    "        num_nodes=1,\n",
+    "        packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[all]\"],\n",
     "        resources_per_node={\n",
-    "            \"cpu\": \"3\",\n",
-    "            \"memory\": \"8Gi\",\n",
-    "            \"nvidia.com/gpu\": 1,\n",
+    "            \"cpu\": \"2\",\n",
+    "            \"memory\": \"12Gi\",\n",
+    "            # Uncomment this to distribute the TrainJob using GPU nodes\n",
+    "            # \"nvidia.com/gpu\": 1,\n",
     "        },\n",
     "    ),\n",
     ")"
@@ -322,7 +328,7 @@
     {
      "data": {
       "text/plain": [
-       "'ac43d22fc37e'"
+       "'hb18b5a7847c'"
       ]
      },
      "execution_count": 5,
@@ -347,7 +353,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 6,
    "id": "fc5de9e8-f798-4cfd-bc6e-f17774cbd235",
    "metadata": {},
    "outputs": [
@@ -355,7 +361,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "TrainJob: ac43d22fc37e, Status: Created, Created at: 2025-04-01 15:34:40+00:00\n"
+      "TrainJob: hb18b5a7847c, Status: Created, Created at: 2025-04-28 10:47:54+00:00\n"
      ]
     }
    ],
@@ -366,7 +372,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 7,
    "id": "a3eec801",
    "metadata": {},
    "outputs": [
@@ -374,8 +380,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Step: node-0, Status: Running, Devices: gpu x 1\n",
-      "Step: node-1, Status: Running, Devices: gpu x 1\n"
+      "Step: node-0, Status: Running, Devices: cpu x 2\n"
      ]
     }
    ],
@@ -397,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 8,
    "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98",
    "metadata": {},
    "outputs": [
@@ -406,77 +411,41 @@
      "output_type": "stream",
      "text": [
       "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n",
+      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] \n",
+      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
+      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n",
+      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
       "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n",
-      "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 542991.51 examples/s]\n",
-      "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 505234.17 examples/s]\n",
-      "Map: 100%|██████████| 4000/4000 [00:02<00:00, 1966.78 examples/s]\n",
-      "Map: 100%|██████████| 1000/1000 [00:00<00:00, 1873.66 examples/s]\n",
+      "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 1, LOCAL_RANK: 1.\n",
+      "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 178019.36 examples/s]\n",
+      "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 277318.46 examples/s]\n",
+      "Map: 100%|██████████| 80/80 [00:00<00:00, 1035.67 examples/s]\n",
+      "Map: 100%|██████████| 20/20 [00:00<00:00, 2118.71 examples/s]\n",
+      "Map: 100%|██████████| 80/80 [00:00<00:00, 794.58 examples/s]\n",
+      "Map: 100%|██████████| 20/20 [00:00<00:00, 2195.22 examples/s]\n",
       "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n",
       "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n",
       "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "  0%|          | 0/375 [00:00<?, ?it/s][rank0]:[W401 15:40:00.952253573 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
-      " 33%|███▎      | 125/375 [01:01<01:45,  2.37it/s]\n",
-      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
-      " 12%|█▎        | 4/32 [00:00<00:01, 24.25it/s]\u001b[A\n",
-      " 22%|██▏       | 7/32 [00:00<00:01, 16.96it/s]\u001b[A\n",
-      " 28%|██▊       | 9/32 [00:00<00:01, 15.60it/s]\u001b[A\n",
-      " 34%|███▍      | 11/32 [00:00<00:01, 14.69it/s]\u001b[A\n",
-      " 41%|████      | 13/32 [00:00<00:01, 14.13it/s]\u001b[A\n",
-      " 47%|████▋     | 15/32 [00:01<00:01, 13.58it/s]\u001b[A\n",
-      " 53%|█████▎    | 17/32 [00:01<00:01, 13.27it/s]\u001b[A\n",
-      " 59%|█████▉    | 19/32 [00:01<00:00, 13.34it/s]\u001b[A\n",
-      " 66%|██████▌   | 21/32 [00:01<00:00, 13.32it/s]\u001b[A\n",
-      " 72%|███████▏  | 23/32 [00:01<00:00, 13.26it/s]\u001b[A\n",
-      " 78%|███████▊  | 25/32 [00:01<00:00, 13.21it/s]\u001b[A\n",
-      " 84%|████████▍ | 27/32 [00:01<00:00, 13.11it/s]\u001b[A\n",
-      " 91%|█████████ | 29/32 [00:02<00:00, 13.03it/s]\u001b[A\n",
-      "                                                 A\n",
-      "[node-0]: {'eval_loss': 2.8629233837127686, 'eval_runtime': 2.4589, 'eval_samples_per_second': 406.689, 'eval_steps_per_second': 13.014, 'epoch': 1.0}\n",
-      " 33%|███▎      | 125/375 [01:03<01:45,  2.37it/s]\n",
-      "100%|██████████| 32/32 [00:02<00:00, 12.98it/s]\u001b[A\n",
-      " 67%|██████▋   | 250/375 [02:19<00:47,  2.63it/s]A\n",
-      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
-      "  9%|▉         | 3/32 [00:00<00:01, 20.59it/s]\u001b[A\n",
-      " 19%|█▉        | 6/32 [00:00<00:01, 15.36it/s]\u001b[A\n",
-      " 25%|██▌       | 8/32 [00:00<00:01, 14.41it/s]\u001b[A\n",
-      " 31%|███▏      | 10/32 [00:00<00:01, 13.83it/s]\u001b[A\n",
-      " 38%|███▊      | 12/32 [00:00<00:01, 13.48it/s]\u001b[A\n",
-      " 44%|████▍     | 14/32 [00:01<00:01, 13.16it/s]\u001b[A\n",
-      " 50%|█████     | 16/32 [00:01<00:01, 13.15it/s]\u001b[A\n",
-      " 56%|█████▋    | 18/32 [00:01<00:01, 13.07it/s]\u001b[A\n",
-      " 62%|██████▎   | 20/32 [00:01<00:00, 12.84it/s]\u001b[A\n",
-      " 69%|██████▉   | 22/32 [00:01<00:00, 12.86it/s]\u001b[A\n",
-      " 75%|███████▌  | 24/32 [00:01<00:00, 12.86it/s]\u001b[A\n",
-      " 81%|████████▏ | 26/32 [00:01<00:00, 12.83it/s]\u001b[A\n",
-      " 88%|████████▊ | 28/32 [00:02<00:00, 12.85it/s]\u001b[A\n",
-      " 94%|█████████▍| 30/32 [00:02<00:00, 12.73it/s]\u001b[A\n",
-      "[node-0]: {'eval_loss': 2.1243765354156494, 'eval_runtime': 2.5006, 'eval_samples_per_second': 399.905, 'eval_steps_per_second': 12.797, 'epoch': 2.0}\n",
-      "                                                 A\n",
-      " 67%|██████▋   | 250/375 [02:21<00:47,  2.63it/s]\n",
-      "100%|██████████| 32/32 [00:02<00:00, 13.06it/s]\u001b[A\n",
-      "100%|██████████| 375/375 [03:26<00:00,  1.70it/s]A\n",
-      "  0%|          | 0/32 [00:00<?, ?it/s]\u001b[A\n",
-      "  9%|▉         | 3/32 [00:00<00:01, 20.97it/s]\u001b[A\n",
-      " 19%|█▉        | 6/32 [00:00<00:01, 15.29it/s]\u001b[A\n",
-      " 25%|██▌       | 8/32 [00:00<00:01, 14.21it/s]\u001b[A\n",
-      " 31%|███▏      | 10/32 [00:00<00:01, 13.74it/s]\u001b[A\n",
-      " 38%|███▊      | 12/32 [00:00<00:01, 13.57it/s]\u001b[A\n",
-      " 44%|████▍     | 14/32 [00:01<00:01, 13.09it/s]\u001b[A\n",
-      " 50%|█████     | 16/32 [00:01<00:01, 12.98it/s]\u001b[A\n",
-      " 56%|█████▋    | 18/32 [00:01<00:01, 12.87it/s]\u001b[A\n",
-      " 62%|██████▎   | 20/32 [00:01<00:00, 12.77it/s]\u001b[A\n",
-      " 69%|██████▉   | 22/32 [00:01<00:00, 12.75it/s]\u001b[A\n",
-      " 75%|███████▌  | 24/32 [00:01<00:00, 12.72it/s]\u001b[A\n",
-      " 81%|████████▏ | 26/32 [00:01<00:00, 12.58it/s]\u001b[A\n",
-      " 88%|████████▊ | 28/32 [00:02<00:00, 12.54it/s]\u001b[A\n",
-      " 94%|█████████▍| 30/32 [00:02<00:00, 12.54it/s]\u001b[A\n",
-      "[node-0]: {'eval_loss': 1.9835596084594727, 'eval_runtime': 2.5234, 'eval_samples_per_second': 396.288, 'eval_steps_per_second': 12.681, 'epoch': 3.0}\n",
-      "                                                 A\n",
-      "100%|██████████| 375/375 [03:30<00:00,  1.70it/s]\n",
-      "100%|██████████| 32/32 [00:02<00:00, 12.76it/s]\u001b[A\n",
-      "[node-0]: {'train_runtime': 210.6231, 'train_samples_per_second': 56.974, 'train_steps_per_second': 1.78, 'train_loss': 2.8177399088541666, 'epoch': 3.0}\n",
-      "100%|██████████| 375/375 [03:30<00:00,  1.78it/s]A\n",
-      "[node-0]: [rank0]:[W401 15:44:47.867488690 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())\n"
+      "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n",
+      "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "  0%|          | 0/40 [00:00<?, ?it/s][rank0]:[W428 10:48:48.618487122 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
+      "[node-0]: [rank1]:[W428 10:48:48.618663330 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
+      "100%|██████████| 40/40 [02:26<00:00,  3.37s/it]\n",
+      "  0%|          | 0/10 [00:00<?, ?it/s]\u001b[A\n",
+      " 20%|██        | 2/10 [00:00<00:03,  2.49it/s]\u001b[A\n",
+      " 30%|███       | 3/10 [00:01<00:04,  1.75it/s]\u001b[A\n",
+      " 40%|████      | 4/10 [00:02<00:03,  1.53it/s]\u001b[A\n",
+      " 50%|█████     | 5/10 [00:03<00:03,  1.41it/s]\u001b[A\n",
+      " 60%|██████    | 6/10 [00:04<00:03,  1.28it/s]\u001b[A\n",
+      " 70%|███████   | 7/10 [00:05<00:02,  1.15it/s]\u001b[A\n",
+      " 80%|████████  | 8/10 [00:06<00:01,  1.17it/s]\u001b[A\n",
+      " 90%|█████████ | 9/10 [00:06<00:00,  1.19it/s]\u001b[A\n",
+      "[node-0]: {'eval_loss': 5.412436485290527, 'eval_runtime': 8.4252, 'eval_samples_per_second': 2.374, 'eval_steps_per_second': 1.187, 'epoch': 1.0}\n",
+      "                                               \u001b[A\n",
+      "100%|██████████| 40/40 [02:36<00:00,  3.37s/it]\n",
+      "100%|██████████| 10/10 [00:07<00:00,  1.21it/s]\u001b[A\n",
+      "[node-0]: {'train_runtime': 156.169, 'train_samples_per_second': 0.512, 'train_steps_per_second': 0.256, 'train_loss': 5.624436950683593, 'epoch': 1.0}\n",
+      "100%|██████████| 40/40 [02:36<00:00,  3.90s/it]\u001b[A\n"
      ]
     }
    ],
@@ -496,52 +465,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 9,
    "id": "1af042a5-abec-456c-a74b-fa78efb8000e",
    "metadata": {},
    "outputs": [],
    "source": [
     "from cloudpathlib import CloudPath\n",
-    "from pathlib import Path\n",
-    "\n",
-    "_ = CloudPath(f'gs://tmp-kftrainer/{MODEL_NAME}').download_to(MODEL_NAME)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "cd32e5eb-75e4-4f87-b310-5e1f48b84a18",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Device set to use mps:0\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'score': 0.13226985931396484,\n",
-       " 'start': 10,\n",
-       " 'end': 95,\n",
-       " 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
     "from transformers import pipeline\n",
     "\n",
-    "question = \"How many programming languages does BLOOM support?\"\n",
-    "context = \"BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages.\"\n",
+    "if BUCKET:\n",
+    "    (CloudPath(BUCKET) / MODEL_NAME).download_to(MODEL_NAME)\n",
+    "\n",
+    "    question = \"How many programming languages does BLOOM support?\"\n",
+    "    context = \"BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages.\"\n",
     "\n",
-    "question_answerer = pipeline(\"question-answering\", model=f\"./{MODEL_NAME}/checkpoint-375\")\n",
-    "question_answerer(question=question, context=context)"
+    "    question_answerer = pipeline(\"question-answering\", model=f\"./{MODEL_NAME}/checkpoint-40\")  # final checkpoint of the 40-step run logged above (80 examples, batch size 1, 2 processes, 1 epoch); adjust if those settings change\n",
+    "    print(question_answerer(question=question, context=context))  # print explicitly: an expression nested in an if-block is not auto-displayed by Jupyter"
    ]
   },
   {
@@ -556,12 +495,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 10,
    "id": "16608b6d-87bf-414d-a174-331f174c6add",
    "metadata": {},
    "outputs": [],
    "source": [
-    "_ = TrainerClient().delete_job(job_id)"
+    "# _ = TrainerClient().delete_job(job_id)"
    ]
   }
  ],
@@ -581,7 +520,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.1"
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,

From d3e4cf41c636031860c54de77b251542045236d1 Mon Sep 17 00:00:00 2001
From: solanyn <14799876+solanyn@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:59:56 +1000
Subject: [PATCH 5/6] chore: extend e2e-run-notebook timeout

* e2e tests fail if trainjobs launched by notebook do not finish in 3s
* extends the timeout to 5min to block and wait for longer trainjobs until timeout or trainjob completes

Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com>
---
 hack/e2e-run-notebook.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh
index d699827ff5..625720dc47 100755
--- a/hack/e2e-run-notebook.sh
+++ b/hack/e2e-run-notebook.sh
@@ -42,7 +42,7 @@ print_results() {
     kubectl describe trainjob
     kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
     kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
-    kubectl wait trainjob --for=condition=Complete --all --timeout 3s
+    kubectl wait trainjob --for=condition=Complete --all --timeout 300s
 }
 
 (papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||

From 920672607b419c974fb6aba7fbf559cb0cae3d6d Mon Sep 17 00:00:00 2001
From: solanyn <14799876+solanyn@users.noreply.github.com>
Date: Tue, 29 Apr 2025 11:28:55 +1000
Subject: [PATCH 6/6] chore: update example to wait for trainjob running status

* revert change to e2e-run-notebook.sh

Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com>
---
 .../fine-tune-distilbert.ipynb                | 132 ++++++++++--------
 hack/e2e-run-notebook.sh                      |   2 +-
 2 files changed, 72 insertions(+), 62 deletions(-)

diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
index 0c1e798d76..a0af152be9 100644
--- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
+++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb
@@ -59,29 +59,20 @@
       "Requirement already satisfied: sympy>=1.13.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.14.0)\n",
       "Requirement already satisfied: networkx in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n",
       "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n",
-      "Collecting azure-storage-blob>=12 (from cloudpathlib[all])\n",
-      "  Using cached azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)\n",
-      "Collecting azure-storage-file-datalake>=12 (from cloudpathlib[all])\n",
-      "  Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl.metadata (16 kB)\n",
+      "Requirement already satisfied: azure-storage-blob>=12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (12.25.1)\n",
+      "Requirement already satisfied: azure-storage-file-datalake>=12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (12.20.0)\n",
       "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (3.1.0)\n",
-      "Collecting boto3>=1.34.0 (from cloudpathlib[all])\n",
-      "  Using cached boto3-1.38.3-py3-none-any.whl.metadata (6.6 kB)\n",
+      "Requirement already satisfied: boto3>=1.34.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (1.38.3)\n",
       "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n",
       "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n",
       "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.4.0)\n",
       "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.4.26)\n",
-      "Collecting azure-core>=1.30.0 (from azure-storage-blob>=12->cloudpathlib[all])\n",
-      "  Using cached azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)\n",
-      "Collecting cryptography>=2.1.4 (from azure-storage-blob>=12->cloudpathlib[all])\n",
-      "  Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)\n",
-      "Collecting isodate>=0.6.1 (from azure-storage-blob>=12->cloudpathlib[all])\n",
-      "  Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)\n",
-      "Collecting botocore<1.39.0,>=1.38.3 (from boto3>=1.34.0->cloudpathlib[all])\n",
-      "  Using cached botocore-1.38.3-py3-none-any.whl.metadata (5.7 kB)\n",
-      "Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.34.0->cloudpathlib[all])\n",
-      "  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n",
-      "Collecting s3transfer<0.13.0,>=0.12.0 (from boto3>=1.34.0->cloudpathlib[all])\n",
-      "  Using cached s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)\n",
+      "Requirement already satisfied: azure-core>=1.30.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (1.33.0)\n",
+      "Requirement already satisfied: cryptography>=2.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (44.0.2)\n",
+      "Requirement already satisfied: isodate>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (0.7.2)\n",
+      "Requirement already satisfied: botocore<1.39.0,>=1.38.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (1.38.3)\n",
+      "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (1.0.1)\n",
+      "Requirement already satisfied: s3transfer<0.13.0,>=0.12.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (0.12.0)\n",
       "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from sympy>=1.13.3->torch>=2.0->transformers[torch]) (1.3.0)\n",
       "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.39.0)\n",
       "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.25.0rc0)\n",
@@ -100,17 +91,6 @@
       "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (4.9.1)\n",
       "Requirement already satisfied: pycparser in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (2.22)\n",
       "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.6.1)\n",
-      "Using cached azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)\n",
-      "Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl (263 kB)\n",
-      "Using cached boto3-1.38.3-py3-none-any.whl (139 kB)\n",
-      "Using cached azure_core-1.33.0-py3-none-any.whl (207 kB)\n",
-      "Using cached botocore-1.38.3-py3-none-any.whl (13.5 MB)\n",
-      "Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl (6.7 MB)\n",
-      "Using cached isodate-0.7.2-py3-none-any.whl (22 kB)\n",
-      "Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
-      "Using cached s3transfer-0.12.0-py3-none-any.whl (84 kB)\n",
-      "Installing collected packages: jmespath, isodate, cryptography, botocore, azure-core, s3transfer, azure-storage-blob, boto3, azure-storage-file-datalake\n",
-      "Successfully installed azure-core-1.33.0 azure-storage-blob-12.25.1 azure-storage-file-datalake-12.20.0 boto3-1.38.3 botocore-1.38.3 cryptography-44.0.2 isodate-0.7.2 jmespath-1.0.1 s3transfer-0.12.0\n",
       "\n",
       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1\u001b[0m\n",
       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
@@ -328,7 +308,7 @@
     {
      "data": {
       "text/plain": [
-       "'hb18b5a7847c'"
+       "'rafd89de924b'"
       ]
      },
      "execution_count": 5,
@@ -361,7 +341,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "TrainJob: hb18b5a7847c, Status: Created, Created at: 2025-04-28 10:47:54+00:00\n"
+      "TrainJob: rafd89de924b, Status: Unknown, Created at: 2025-04-29 01:22:14+00:00\n"
      ]
     }
    ],
@@ -373,6 +353,36 @@
   {
    "cell_type": "code",
    "execution_count": 7,
+   "id": "c9f8ef95-b309-4987-9abb-760dc9c1e050",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waiting for TrainJob running status. Sleep for 5 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n",
+    "import time\n",
+    "\n",
+    "def wait_for_job_running():\n",
+    "    \"\"\"Poll the TrainJob every 5 seconds (at most 100 times) until a step is Running; raise TimeoutError otherwise.\"\"\"\n",
+    "    for _ in range(100):\n",
+    "        if any(c.status == \"Running\" for c in TrainerClient().get_job(name=job_id).steps):\n",
+    "            return\n",
+    "        print(\"Waiting for TrainJob running status. Sleep for 5 seconds\")\n",
+    "        time.sleep(5)\n",
+    "    raise TimeoutError(f\"TrainJob {job_id} did not reach Running status after 100 checks\")\n",
+    "\n",
+    "wait_for_job_running()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
    "id": "a3eec801",
    "metadata": {},
    "outputs": [
@@ -402,7 +412,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98",
    "metadata": {},
    "outputs": [
@@ -411,41 +421,41 @@
      "output_type": "stream",
      "text": [
       "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n",
-      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] \n",
-      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
-      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n",
-      "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
-      "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n",
+      "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] \n",
+      "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
+      "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n",
+      "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] *****************************************\n",
       "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 1, LOCAL_RANK: 1.\n",
-      "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 178019.36 examples/s]\n",
-      "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 277318.46 examples/s]\n",
-      "Map: 100%|██████████| 80/80 [00:00<00:00, 1035.67 examples/s]\n",
-      "Map: 100%|██████████| 20/20 [00:00<00:00, 2118.71 examples/s]\n",
-      "Map: 100%|██████████| 80/80 [00:00<00:00, 794.58 examples/s]\n",
-      "Map: 100%|██████████| 20/20 [00:00<00:00, 2195.22 examples/s]\n",
+      "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n",
+      "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 627416.25 examples/s]\n",
+      "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 1041555.11 examples/s]\n",
+      "Map: 100%|██████████| 80/80 [00:00<00:00, 1361.10 examples/s]\n",
+      "Map: 100%|██████████| 20/20 [00:00<00:00, 2123.27 examples/s]\n",
+      "Map: 100%|██████████| 80/80 [00:00<00:00, 1030.86 examples/s]\n",
+      "Map: 100%|██████████| 20/20 [00:00<00:00, 1877.53 examples/s]\n",
       "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n",
       "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n",
       "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
       "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n",
       "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "  0%|          | 0/40 [00:00<?, ?it/s][rank0]:[W428 10:48:48.618487122 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
-      "[node-0]: [rank1]:[W428 10:48:48.618663330 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
-      "100%|██████████| 40/40 [02:26<00:00,  3.37s/it]\n",
+      "  0%|          | 0/40 [00:00<?, ?it/s][rank0]:[W429 01:22:58.895439547 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
+      "[node-0]: [rank1]:[W429 01:22:58.895689005 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
+      "100%|██████████| 40/40 [02:36<00:00,  4.10s/it]\n",
       "  0%|          | 0/10 [00:00<?, ?it/s]\u001b[A\n",
-      " 20%|██        | 2/10 [00:00<00:03,  2.49it/s]\u001b[A\n",
-      " 30%|███       | 3/10 [00:01<00:04,  1.75it/s]\u001b[A\n",
-      " 40%|████      | 4/10 [00:02<00:03,  1.53it/s]\u001b[A\n",
-      " 50%|█████     | 5/10 [00:03<00:03,  1.41it/s]\u001b[A\n",
-      " 60%|██████    | 6/10 [00:04<00:03,  1.28it/s]\u001b[A\n",
-      " 70%|███████   | 7/10 [00:05<00:02,  1.15it/s]\u001b[A\n",
-      " 80%|████████  | 8/10 [00:06<00:01,  1.17it/s]\u001b[A\n",
-      " 90%|█████████ | 9/10 [00:06<00:00,  1.19it/s]\u001b[A\n",
-      "[node-0]: {'eval_loss': 5.412436485290527, 'eval_runtime': 8.4252, 'eval_samples_per_second': 2.374, 'eval_steps_per_second': 1.187, 'epoch': 1.0}\n",
+      " 20%|██        | 2/10 [00:00<00:03,  2.54it/s]\u001b[A\n",
+      " 30%|███       | 3/10 [00:01<00:03,  1.80it/s]\u001b[A\n",
+      " 40%|████      | 4/10 [00:02<00:03,  1.55it/s]\u001b[A\n",
+      " 50%|█████     | 5/10 [00:03<00:03,  1.43it/s]\u001b[A\n",
+      " 60%|██████    | 6/10 [00:03<00:02,  1.37it/s]\u001b[A\n",
+      " 70%|███████   | 7/10 [00:04<00:02,  1.32it/s]\u001b[A\n",
+      " 80%|████████  | 8/10 [00:05<00:01,  1.30it/s]\u001b[A\n",
+      " 90%|█████████ | 9/10 [00:06<00:00,  1.29it/s]\u001b[A\n",
       "                                               \u001b[A\n",
-      "100%|██████████| 40/40 [02:36<00:00,  3.37s/it]\n",
-      "100%|██████████| 10/10 [00:07<00:00,  1.21it/s]\u001b[A\n",
-      "[node-0]: {'train_runtime': 156.169, 'train_samples_per_second': 0.512, 'train_steps_per_second': 0.256, 'train_loss': 5.624436950683593, 'epoch': 1.0}\n",
-      "100%|██████████| 40/40 [02:36<00:00,  3.90s/it]\u001b[A\n"
+      "[node-0]: {'eval_loss': 5.543211936950684, 'eval_runtime': 7.9713, 'eval_samples_per_second': 2.509, 'eval_steps_per_second': 1.254, 'epoch': 1.0}\n",
+      "100%|██████████| 40/40 [02:45<00:00,  4.10s/it]\n",
+      "100%|██████████| 10/10 [00:07<00:00,  1.28it/s]\u001b[A\n",
+      "[node-0]: {'train_runtime': 165.12, 'train_samples_per_second': 0.484, 'train_steps_per_second': 0.242, 'train_loss': 5.764264678955078, 'epoch': 1.0}\n",
+      "100%|██████████| 40/40 [02:45<00:00,  4.13s/it]\u001b[A\n"
      ]
     }
    ],
@@ -465,7 +475,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "1af042a5-abec-456c-a74b-fa78efb8000e",
    "metadata": {},
    "outputs": [],
@@ -495,7 +505,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "16608b6d-87bf-414d-a174-331f174c6add",
    "metadata": {},
    "outputs": [],
diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh
index 625720dc47..d699827ff5 100755
--- a/hack/e2e-run-notebook.sh
+++ b/hack/e2e-run-notebook.sh
@@ -42,7 +42,7 @@ print_results() {
     kubectl describe trainjob
     kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
     kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
-    kubectl wait trainjob --for=condition=Complete --all --timeout 300s
+    kubectl wait trainjob --for=condition=Complete --all --timeout 3s
 }
 
 (papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||