From 75062fd48c89375969dfe8a92099ef0a46ccdfc5 Mon Sep 17 00:00:00 2001 From: solanyn <14799876+solanyn@users.noreply.github.com> Date: Wed, 2 Apr 2025 03:15:57 +1100 Subject: [PATCH 1/6] Add question-answer example Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com> --- .../fine-tune-distilbert.ipynb | 592 ++++++++++++++++++ 1 file changed, 592 insertions(+) create mode 100644 examples/pytorch/question-answering/fine-tune-distilbert.ipynb diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb new file mode 100644 index 0000000000..94f5f4e954 --- /dev/null +++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb @@ -0,0 +1,592 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0ee5aff-46d6-47c5-8306-901cfd9206d9", + "metadata": {}, + "source": [ + "# Fine-tuning DistilBERT for question answering\n", + "\n", + "This guide describes fine-tuning DistilBERT with Stanford Question Answering Dataset (SQuAD) for question-answering using Kubeflow Trainer.\n", + "\n", + "This guide is adapted from HuggingFace question answering task recipe page: https://huggingface.co/docs/transformers/en/tasks/question_answering\n", + "\n", + "Pretrained DistilBERT: https://huggingface.co/docs/transformers/en/model_doc/distilbert\n", + "\n", + "SQuAD dataset: https://huggingface.co/datasets/rajpurkar/squad" + ] + }, + { + "cell_type": "markdown", + "id": "c31bc8f2", + "metadata": {}, + "source": [ + "# Install the KubeFlow SDK and dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "10606685", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n", + " Cloning https://github.com/kubeflow/trainer.git (to revision master) to /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n", + " Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/trainer.git /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n", + " Resolved https://github.com/kubeflow/trainer.git to commit 3781eda0e675c655d03bc4cb84cce4362f601e44\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", + "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: kubernetes>=27.2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (32.0.1)\n", + "Requirement already satisfied: pydantic>=2.10.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (2.11.0)\n", + "Requirement already satisfied: certifi>=14.05.14 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2025.1.31)\n", + "Requirement already satisfied: six>=1.9.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.17.0)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.9.0.post0)\n", + "Requirement already satisfied: pyyaml>=5.4.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (6.0.2)\n", + "Requirement already satisfied: google-auth>=1.0.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.38.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.8.0)\n", + "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.32.3)\n", + "Requirement already satisfied: requests-oauthlib in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.0.0)\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (3.2.2)\n", + "Requirement already satisfied: urllib3>=1.24.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.3.0)\n", + "Requirement already satisfied: durationpy>=0.7 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (0.9)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.33.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (2.33.0)\n", + "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (4.13.0)\n", + "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.4.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (5.5.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.4.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.10)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.6.1)\n", + "Requirement already satisfied: cloudpathlib[gs] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (0.21.0)\n", + "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (4.50.3)\n", + "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from cloudpathlib[gs]) (3.1.0)\n", + "Requirement already satisfied: filelock in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.29.3)\n", + "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.4)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n", + "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n", + "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.6.0)\n", + "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n", + "Requirement already satisfied: psutil in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (2024.12.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (4.13.0)\n", + "Requirement already satisfied: networkx in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n", + "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n", + "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (78.1.0)\n", + "Requirement already satisfied: sympy==1.13.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=2.0->transformers[torch]) (1.3.0)\n", + "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.38.0)\n", + "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.24.2)\n", + "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.4.3)\n", + "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.7.2)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (1.7.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.1.31)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.69.2)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (6.30.2)\n", + "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.26.1)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (5.5.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.4.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (4.9)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.6.1)\n" + ] + } + ], + "source": [ + "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n", + "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\"" + ] + }, + { + "cell_type": "markdown", + "id": "2a15b91f", + "metadata": {}, + "source": [ + "# Define the HuggingFace training script\n", + "\n", + "We need to wrap our training script into a function to create the Kubeflow TrainJob." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "24e7f396-32ce-4d23-b76f-9684de470471", + "metadata": {}, + "outputs": [], + "source": [ + "def train_distilbert(args):\n", + " import os\n", + "\n", + " from cloudpathlib import CloudPath\n", + " from datasets import load_dataset\n", + " import torch\n", + " from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering, TrainingArguments, Trainer\n", + "\n", + " import torch.distributed as dist\n", + "\n", + " # Initialize distributed environment\n", + " _, backend = (\"cuda\", \"nccl\") if torch.cuda.is_available() else (\"cpu\", \"gloo\")\n", + " dist.init_process_group(backend=backend)\n", + "\n", + " local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n", + " print(\n", + " \"Distributed Training with WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}.\".format(\n", + " dist.get_world_size(),\n", + " dist.get_rank(),\n", + " local_rank,\n", + " )\n", + " )\n", + "\n", + " # Download the dataset and tokenizer\n", + " squad = load_dataset(\"squad\", split=\"train[:5000]\") \n", + "\n", + " squad = squad.train_test_split(test_size=0.2)\n", + " \n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilbert-base-uncased\")\n", + " \n", + " # Define the preprocessing function\n", + " def preprocess_function(examples):\n", + " questions = [q.strip() for q in examples[\"question\"]]\n", + " inputs = tokenizer(\n", + " questions,\n", + " examples[\"context\"],\n", + " max_length=384,\n", + " truncation=\"only_second\",\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\",\n", + " )\n", + " \n", + " offset_mapping = inputs.pop(\"offset_mapping\")\n", + " answers = examples[\"answers\"]\n", + " start_positions = []\n", + " end_positions = []\n", + " \n", + " for i, offset in enumerate(offset_mapping):\n", + " answer = answers[i]\n", + " start_char = answer[\"answer_start\"][0]\n", + " end_char = answer[\"answer_start\"][0] + len(answer[\"text\"][0])\n", + " sequence_ids = inputs.sequence_ids(i)\n", + " \n", + " # Find the start and end of the context\n", + " idx = 0\n", + " while sequence_ids[idx] != 1:\n", + " idx += 1\n", + " context_start = idx\n", + " while sequence_ids[idx] == 1:\n", + " idx += 1\n", + " context_end = idx - 1\n", + " \n", + " # If the answer is not fully inside the context, label it (0, 0)\n", + " if offset[context_start][0] > end_char or offset[context_end][1] < start_char:\n", + " start_positions.append(0)\n", + " end_positions.append(0)\n", + " else:\n", + " # Otherwise it's the start and end token positions\n", + " idx = context_start\n", + " while idx <= context_end and offset[idx][0] <= start_char:\n", + " idx += 1\n", + " start_positions.append(idx - 1)\n", + " \n", + " idx = context_end\n", + " while idx >= context_start and offset[idx][1] >= end_char:\n", + " idx -= 1\n", + " end_positions.append(idx + 1)\n", + " \n", + " inputs[\"start_positions\"] = start_positions\n", + " inputs[\"end_positions\"] = end_positions\n", + " return inputs\n", + " \n", + " # Apply the preprocessing function to the dataset\n", + " tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad[\"train\"].column_names)\n", + " \n", + " # Create a batch of examples using DefaultDataCollator\n", + " data_collator = DefaultDataCollator()\n", + "\n", + " # Load the model\n", + " model = AutoModelForQuestionAnswering.from_pretrained(\"distilbert/distilbert-base-uncased\")\n", + "\n", + " # Define training hyperparameters\n", + " training_args = TrainingArguments(\n", + " output_dir=args[\"MODEL_NAME\"],\n", + " eval_strategy=\"epoch\",\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=16,\n", + " per_device_eval_batch_size=16,\n", + " num_train_epochs=3,\n", + " weight_decay=0.01,\n", + " push_to_hub=False,\n", + " )\n", + " \n", + " # Prepare trainer with configuration\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=tokenized_squad[\"train\"],\n", + " eval_dataset=tokenized_squad[\"test\"],\n", + " processing_class=tokenizer,\n", + " data_collator=data_collator,\n", + " )\n", + " \n", + " trainer.train()\n", + "\n", + " CloudPath(f'gs://{args[\"BUCKET\"]}/{args[\"MODEL_NAME\"]}').upload_from(args[\"MODEL_NAME\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bf5ab9ba-6054-40d6-839d-f84ff0fba8fc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n", + "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n", + "\n", + "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n", + "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n", + "\n", + "Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer\n", + "Entrypoint: ['torchrun']\n", + "\n", + "Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer\n", + "Entrypoint: ['torchrun']\n", + "\n" + ] + } + ], + "source": [ + "from kubeflow.trainer import TrainerClient, CustomTrainer\n", + "\n", + "for r in TrainerClient().list_runtimes():\n", + " print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n", + " print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n", + " print()\n", + "\n", + " if r.name == \"torch-distributed\":\n", + " torch_runtime = r" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e3fd0c5f-f359-4c6c-9f0e-2e91904579b3", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"tmp-kftrainer\"\n", + "MODEL_NAME = \"qa-distilbert\"\n", + "args = {\n", + " \"BUCKET\": BUCKET,\n", + " \"MODEL_NAME\": MODEL_NAME,\n", + "}\n", + "\n", + "job_id = TrainerClient().train(\n", + " trainer=CustomTrainer(\n", + " func=train_distilbert,\n", + " func_args=args,\n", + " num_nodes=2,\n", + " packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[gs]\"],\n", + " resources_per_node={\n", + " \"cpu\": \"3\",\n", + " \"memory\": \"8Gi\",\n", + " # Uncomment this to distribute the TrainJob using GPU nodes.\n", + " \"nvidia.com/gpu\": 1,\n", + " },\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "aabade1b-2c0b-492b-be97-03b4f0e037f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ac43d22fc37e'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train API generates a random TrainJob id.\n", + "job_id" + ] + }, + { + "cell_type": "markdown", + "id": "e9294ea5", + "metadata": {}, + "source": [ + "# Check the TrainJob details\n", + "\n", + "Use `list_jobs()` and `get_job()` APIs to get details about the created TrainJob and its steps." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fc5de9e8-f798-4cfd-bc6e-f17774cbd235", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TrainJob: ac43d22fc37e, Status: Created, Created at: 2025-04-01 15:34:40+00:00\n" + ] + } + ], + "source": [ + "for job in TrainerClient().list_jobs():\n", + " print(f\"TrainJob: {job.name}, Status: {job.status}, Created at: {job.creation_timestamp}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a3eec801", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: node-0, Status: Running, Devices: gpu x 1\n", + "Step: node-1, Status: Running, Devices: gpu x 1\n" + ] + } + ], + "source": [ + "# We execute mpirun command on node-0, which functions as the MPI Launcher node.\n", + "for c in TrainerClient().get_job(name=job_id).steps:\n", + " print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "30f812d7", + "metadata": {}, + "source": [ + "# Show the TrainJob logs\n", + "\n", + "Use `get_job_logs()` API to retrieve the TrainJob logs." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n", + "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n", + "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 542991.51 examples/s]\n", + "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 505234.17 examples/s]\n", + "Map: 100%|██████████| 4000/4000 [00:02<00:00, 1966.78 examples/s]\n", + "Map: 100%|██████████| 1000/1000 [00:00<00:00, 1873.66 examples/s]\n", + "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n", + "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n", + "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + " 0%| | 0/375 [00:00 Date: Wed, 2 Apr 2025 15:20:27 +1100 Subject: [PATCH 2/6] chore: remove unused lines, add TODO comment Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com> --- .../pytorch/question-answering/fine-tune-distilbert.ipynb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb index 94f5f4e954..1d2cb988bd 100644 --- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb +++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb @@ -110,6 +110,7 @@ } ], "source": [ + "# TODO: Change the version of SDK when we have the first release of Trainer SDK\n", "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n", "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\"" ] @@ -280,10 +281,7 @@ "for r in TrainerClient().list_runtimes():\n", " print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n", " print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n", - " print()\n", - "\n", - " if r.name == \"torch-distributed\":\n", - " torch_runtime = r" + "\n" ] }, { @@ -309,7 +307,6 @@ " resources_per_node={\n", " \"cpu\": \"3\",\n", " \"memory\": \"8Gi\",\n", - " # Uncomment this to distribute the TrainJob using GPU nodes.\n", " \"nvidia.com/gpu\": 1,\n", " },\n", " ),\n", From ec8d1eac1ac04326ea2cac7ecb9b571cff1ff53d Mon Sep 17 00:00:00 2001 From: Andrew Chen <14799876+solanyn@users.noreply.github.com> Date: Sat, 26 Apr 2025 08:33:20 +1000 Subject: [PATCH 3/6] chore: update example description Co-authored-by: Andrey Velichkevich Signed-off-by: Andrew Chen <14799876+solanyn@users.noreply.github.com> --- examples/pytorch/question-answering/fine-tune-distilbert.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb index 1d2cb988bd..a9af34f1ac 100644 --- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb +++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb @@ -7,7 +7,7 @@ "source": [ "# Fine-tuning DistilBERT for question answering\n", "\n", - "This guide describes fine-tuning DistilBERT with Stanford Question Answering Dataset (SQuAD) for question-answering using Kubeflow Trainer.\n", + "This guide describes fine-tuning DistilBERT model with Stanford Question Answering Dataset (SQuAD) for question-answering using Kubeflow Trainer.\n", "\n", "This guide is adapted from HuggingFace question answering task recipe page: https://huggingface.co/docs/transformers/en/tasks/question_answering\n", "\n", From 5445bc1f80d1de637ce67645992ff3c9b9ee069f Mon Sep 17 00:00:00 2001 From: solanyn <14799876+solanyn@users.noreply.github.com> Date: Mon, 28 Apr 2025 20:54:47 +1000 Subject: [PATCH 4/6] chore: update question-answering example * run train job on CPU * reduce batch size, dataset size and train epochs * make upload to bucket optional * add notebook to e2e-test * set model name as trainjob argument Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com> --- .github/workflows/test-e2e.yaml | 1 + .../fine-tune-distilbert.ipynb | 361 ++++++++---------- 2 files changed, 151 insertions(+), 211 deletions(-) diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml index fed66a1f8b..145c2a87bb 100644 --- a/.github/workflows/test-e2e.yaml +++ b/.github/workflows/test-e2e.yaml @@ -56,6 +56,7 @@ jobs: run: | mkdir -p artifacts/notebooks make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb TIMEOUT=900 + make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/question-answering/fine-tune-distilbert.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb TIMEOUT=900 # TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks. - name: Upload Artifacts to GitHub diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb index a9af34f1ac..0c1e798d76 100644 --- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb +++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb @@ -21,7 +21,9 @@ "id": "c31bc8f2", "metadata": {}, "source": [ - "# Install the KubeFlow SDK and dependencies" + "# Install the KubeFlow SDK and dependencies\n", + "\n", + "To install the KubeFlow SDK, see: https://www.kubeflow.org/docs/components/trainer/getting-started/" ] }, { @@ -36,83 +38,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n", - " Cloning https://github.com/kubeflow/trainer.git (to revision master) to /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n", - " Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/trainer.git /private/var/folders/_v/2h8yrb15367bgt_lf2y9tly00000gn/T/pip-req-build-gpc3o8se\n", - " Resolved https://github.com/kubeflow/trainer.git to commit 3781eda0e675c655d03bc4cb84cce4362f601e44\n", - " Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", - "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: kubernetes>=27.2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (32.0.1)\n", - "Requirement already satisfied: pydantic>=2.10.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubeflow==0.1.0) (2.11.0)\n", - "Requirement already satisfied: certifi>=14.05.14 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2025.1.31)\n", - "Requirement already satisfied: six>=1.9.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.17.0)\n", - "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.9.0.post0)\n", - "Requirement already satisfied: pyyaml>=5.4.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (6.0.2)\n", - "Requirement already satisfied: google-auth>=1.0.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.38.0)\n", - "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (1.8.0)\n", - "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.32.3)\n", - "Requirement already satisfied: requests-oauthlib in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.0.0)\n", - "Requirement already satisfied: oauthlib>=3.2.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (3.2.2)\n", - "Requirement already satisfied: urllib3>=1.24.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (2.3.0)\n", - "Requirement already satisfied: durationpy>=0.7 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from kubernetes>=27.2.0->kubeflow==0.1.0) (0.9)\n", - "Requirement already satisfied: annotated-types>=0.6.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.7.0)\n", - "Requirement already satisfied: pydantic-core==2.33.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (2.33.0)\n", - "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (4.13.0)\n", - "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pydantic>=2.10.0->kubeflow==0.1.0) (0.4.0)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (5.5.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.4.2)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (4.9)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.4.1)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->kubernetes>=27.2.0->kubeflow==0.1.0) (3.10)\n", - "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=27.2.0->kubeflow==0.1.0) (0.6.1)\n", - "Requirement already satisfied: cloudpathlib[gs] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (0.21.0)\n", - "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (4.50.3)\n", - "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from cloudpathlib[gs]) (3.1.0)\n", - "Requirement already satisfied: filelock in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n", - "Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.29.3)\n", - "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.4)\n", - "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (24.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n", - "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n", - "Requirement already satisfied: requests in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n", - "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n", - "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n", - "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n", - "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.6.0)\n", - "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n", - "Requirement already satisfied: psutil in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (2024.12.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers[torch]) (4.13.0)\n", - "Requirement already satisfied: networkx in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n", - "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n", - "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (78.1.0)\n", - "Requirement already satisfied: sympy==1.13.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.13.1)\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=2.0->transformers[torch]) (1.3.0)\n", - "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.38.0)\n", - "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.24.2)\n", - "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.4.3)\n", - "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (2.7.2)\n", - "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[gs]) (1.7.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.3.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.1.31)\n", - "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.69.2)\n", - "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (6.30.2)\n", - "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[gs]) (1.26.1)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (5.5.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.4.2)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (4.9)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n", - "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/examples/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[gs]) (0.6.1)\n" + "Requirement already satisfied: cloudpathlib[all] in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (0.21.0)\n", + "Requirement already satisfied: transformers[torch] in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (4.51.3)\n", + "Requirement already satisfied: filelock in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (3.18.0)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.30.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.30.2)\n", + "Requirement already satisfied: numpy>=1.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.2.5)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (25.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2024.11.6)\n", + "Requirement already satisfied: requests in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.32.3)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.21.1)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (0.5.3)\n", + "Requirement already satisfied: tqdm>=4.27 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (4.67.1)\n", + "Requirement already satisfied: torch>=2.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (2.7.0)\n", + "Requirement already satisfied: accelerate>=0.26.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from transformers[torch]) (1.6.0)\n", + "Requirement already satisfied: psutil in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from accelerate>=0.26.0->transformers[torch]) (7.0.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.30.0->transformers[torch]) (2025.3.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from huggingface-hub<1.0,>=0.30.0->transformers[torch]) (4.13.2)\n", + "Requirement already satisfied: setuptools in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (80.0.0)\n", + "Requirement already satisfied: sympy>=1.13.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.14.0)\n", + "Requirement already satisfied: networkx in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n", + "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n", + "Collecting azure-storage-blob>=12 (from cloudpathlib[all])\n", + " Using cached azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)\n", + "Collecting azure-storage-file-datalake>=12 (from cloudpathlib[all])\n", + " Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl.metadata (16 kB)\n", + "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (3.1.0)\n", + "Collecting boto3>=1.34.0 (from cloudpathlib[all])\n", + " Using cached boto3-1.38.3-py3-none-any.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.4.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.4.26)\n", + "Collecting azure-core>=1.30.0 (from azure-storage-blob>=12->cloudpathlib[all])\n", + " Using cached azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)\n", + "Collecting cryptography>=2.1.4 (from azure-storage-blob>=12->cloudpathlib[all])\n", + " Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)\n", + "Collecting isodate>=0.6.1 (from azure-storage-blob>=12->cloudpathlib[all])\n", + " Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)\n", + "Collecting botocore<1.39.0,>=1.38.3 (from boto3>=1.34.0->cloudpathlib[all])\n", + " Using cached botocore-1.38.3-py3-none-any.whl.metadata (5.7 kB)\n", + "Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.34.0->cloudpathlib[all])\n", + " Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n", + "Collecting s3transfer<0.13.0,>=0.12.0 (from boto3>=1.34.0->cloudpathlib[all])\n", + " Using cached s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from sympy>=1.13.3->torch>=2.0->transformers[torch]) (1.3.0)\n", + "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.39.0)\n", + "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.25.0rc0)\n", + "Requirement already satisfied: google-cloud-core<3.0dev,>=2.4.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.4.3)\n", + "Requirement already satisfied: google-resumable-media>=2.7.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.7.2)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (1.7.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from jinja2->torch>=2.0->transformers[torch]) (3.0.2)\n", + "Requirement already satisfied: six>=1.11.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-core>=1.30.0->azure-storage-blob>=12->cloudpathlib[all]) (1.17.0)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from botocore<1.39.0,>=1.38.3->boto3>=1.34.0->cloudpathlib[all]) (2.9.0.post0)\n", + "Requirement already satisfied: cffi>=1.12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (1.17.1)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (1.70.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (6.30.2)\n", + "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->cloudpathlib[all]) (1.26.1)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (5.5.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.4.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (4.9.1)\n", + "Requirement already satisfied: pycparser in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (2.22)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.6.1)\n", + "Using cached azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)\n", + "Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl (263 kB)\n", + "Using cached boto3-1.38.3-py3-none-any.whl (139 kB)\n", + "Using cached azure_core-1.33.0-py3-none-any.whl (207 kB)\n", + "Using cached botocore-1.38.3-py3-none-any.whl (13.5 MB)\n", + "Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl (6.7 MB)\n", + "Using cached isodate-0.7.2-py3-none-any.whl (22 kB)\n", + "Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)\n", + "Using cached s3transfer-0.12.0-py3-none-any.whl (84 kB)\n", + "Installing collected packages: jmespath, isodate, cryptography, botocore, azure-core, s3transfer, azure-storage-blob, boto3, azure-storage-file-datalake\n", + "Successfully installed azure-core-1.33.0 azure-storage-blob-12.25.1 azure-storage-file-datalake-12.20.0 boto3-1.38.3 botocore-1.38.3 cryptography-44.0.2 isodate-0.7.2 jmespath-1.0.1 s3transfer-0.12.0\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ - "# TODO: Change the version of SDK when we have the first release of Trainer SDK\n", - "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk\n", - "!pip install \"cloudpathlib[gs]\" \"transformers[torch]\"" + "!pip install \"cloudpathlib[all]\" \"transformers[torch]\"" ] }, { @@ -156,11 +162,11 @@ " )\n", "\n", " # Download the dataset and tokenizer\n", - " squad = load_dataset(\"squad\", split=\"train[:5000]\") \n", + " squad = load_dataset(\"squad\", split=\"train[:100]\") \n", "\n", - " squad = squad.train_test_split(test_size=0.2)\n", + " squad = squad.train_test_split(test_size=0.2, shuffle=False)\n", " \n", - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilbert-base-uncased\")\n", + " tokenizer = AutoTokenizer.from_pretrained(f'distilbert/{args[\"MODEL_NAME\"]}')\n", " \n", " # Define the preprocessing function\n", " def preprocess_function(examples):\n", @@ -221,16 +227,16 @@ " data_collator = DefaultDataCollator()\n", "\n", " # Load the model\n", - " model = AutoModelForQuestionAnswering.from_pretrained(\"distilbert/distilbert-base-uncased\")\n", + " model = AutoModelForQuestionAnswering.from_pretrained(f'distilbert/{args[\"MODEL_NAME\"]}')\n", "\n", " # Define training hyperparameters\n", " training_args = TrainingArguments(\n", " output_dir=args[\"MODEL_NAME\"],\n", " eval_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=16,\n", - " num_train_epochs=3,\n", + " per_device_train_batch_size=1,\n", + " per_device_eval_batch_size=1,\n", + " num_train_epochs=1,\n", " weight_decay=0.01,\n", " push_to_hub=False,\n", " )\n", @@ -247,7 +253,9 @@ " \n", " trainer.train()\n", "\n", - " CloudPath(f'gs://{args[\"BUCKET\"]}/{args[\"MODEL_NAME\"]}').upload_from(args[\"MODEL_NAME\"])" + " # Upload the fine-tuned model\n", + " if args.get(\"BUCKET\", None):\n", + " (CloudPath(args[\"BUCKET\"]) / args[\"MODEL_NAME\"]).upload_from(args[\"MODEL_NAME\"])" ] }, { @@ -262,16 +270,12 @@ "text": [ "Name: deepspeed-distributed, Framework: deepspeed, Trainer Type: CustomTrainer\n", "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n", - "\n", "Name: mlx-distributed, Framework: mlx, Trainer Type: CustomTrainer\n", "Entrypoint: ['mpirun', '--hostfile', '/etc/mpi/hostfile']\n", - "\n", "Name: mpi-distributed, Framework: torch, Trainer Type: CustomTrainer\n", "Entrypoint: ['torchrun']\n", - "\n", "Name: torch-distributed, Framework: torch, Trainer Type: CustomTrainer\n", - "Entrypoint: ['torchrun']\n", - "\n" + "Entrypoint: ['torchrun']\n" ] } ], @@ -280,8 +284,7 @@ "\n", "for r in TrainerClient().list_runtimes():\n", " print(f\"Name: {r.name}, Framework: {r.trainer.framework.value}, Trainer Type: {r.trainer.trainer_type.value}\")\n", - " print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")\n", - "\n" + " print(f\"Entrypoint: {r.trainer.entrypoint[:3]}\")" ] }, { @@ -291,8 +294,10 @@ "metadata": {}, "outputs": [], "source": [ - "BUCKET = \"tmp-kftrainer\"\n", - "MODEL_NAME = \"qa-distilbert\"\n", + "# To upload to object storage (S3, GCS or Azure Blob Storage), set the bucket with protocol, e.g., \"s3://my-bucket/folder\"\n", + "BUCKET = None\n", + "\n", + "MODEL_NAME = \"distilbert-base-uncased\"\n", "args = {\n", " \"BUCKET\": BUCKET,\n", " \"MODEL_NAME\": MODEL_NAME,\n", @@ -302,12 +307,13 @@ " trainer=CustomTrainer(\n", " func=train_distilbert,\n", " func_args=args,\n", - " num_nodes=2,\n", - " packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[gs]\"],\n", + " num_nodes=1,\n", + " packages_to_install=[\"datasets\", \"transformers[torch]\", \"cloudpathlib[all]\"],\n", " resources_per_node={\n", - " \"cpu\": \"3\",\n", - " \"memory\": \"8Gi\",\n", - " \"nvidia.com/gpu\": 1,\n", + " \"cpu\": \"2\",\n", + " \"memory\": \"12Gi\",\n", + " # Uncomment this to distribute the TrainJob using GPU nodes\n", + " # \"nvidia.com/gpu\": 1,\n", " },\n", " ),\n", ")" @@ -322,7 +328,7 @@ { "data": { "text/plain": [ - "'ac43d22fc37e'" + "'hb18b5a7847c'" ] }, "execution_count": 5, @@ -347,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "fc5de9e8-f798-4cfd-bc6e-f17774cbd235", "metadata": {}, "outputs": [ @@ -355,7 +361,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "TrainJob: ac43d22fc37e, Status: Created, Created at: 2025-04-01 15:34:40+00:00\n" + "TrainJob: hb18b5a7847c, Status: Created, Created at: 2025-04-28 10:47:54+00:00\n" ] } ], @@ -366,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "id": "a3eec801", "metadata": {}, "outputs": [ @@ -374,8 +380,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Step: node-0, Status: Running, Devices: gpu x 1\n", - "Step: node-1, Status: Running, Devices: gpu x 1\n" + "Step: node-0, Status: Running, Devices: cpu x 2\n" ] } ], @@ -397,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98", "metadata": {}, "outputs": [ @@ -406,77 +411,41 @@ "output_type": "stream", "text": [ "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n", + "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] \n", + "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", + "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", + "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n", - "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 542991.51 examples/s]\n", - "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 505234.17 examples/s]\n", - "Map: 100%|██████████| 4000/4000 [00:02<00:00, 1966.78 examples/s]\n", - "Map: 100%|██████████| 1000/1000 [00:00<00:00, 1873.66 examples/s]\n", + "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 1, LOCAL_RANK: 1.\n", + "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 178019.36 examples/s]\n", + "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 277318.46 examples/s]\n", + "Map: 100%|██████████| 80/80 [00:00<00:00, 1035.67 examples/s]\n", + "Map: 100%|██████████| 20/20 [00:00<00:00, 2118.71 examples/s]\n", + "Map: 100%|██████████| 80/80 [00:00<00:00, 794.58 examples/s]\n", + "Map: 100%|██████████| 20/20 [00:00<00:00, 2195.22 examples/s]\n", "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n", "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n", "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - " 0%| | 0/375 [00:00 Date: Mon, 28 Apr 2025 22:59:56 +1000 Subject: [PATCH 5/6] chore: extend e2e-run-notebook timeout * e2e tests fail if trainjobs launched by notebook do not finish in 3s * extends the timeout to 5min to block and wait for longer trainjobs until timeout or trainjob completes Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com> --- hack/e2e-run-notebook.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh index d699827ff5..625720dc47 100755 --- a/hack/e2e-run-notebook.sh +++ b/hack/e2e-run-notebook.sh @@ -42,7 +42,7 @@ print_results() { kubectl describe trainjob kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1 - kubectl wait trainjob --for=condition=Complete --all --timeout 3s + kubectl wait trainjob --for=condition=Complete --all --timeout 300s } (papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) || From 920672607b419c974fb6aba7fbf559cb0cae3d6d Mon Sep 17 00:00:00 2001 From: solanyn <14799876+solanyn@users.noreply.github.com> Date: Tue, 29 Apr 2025 11:28:55 +1000 Subject: [PATCH 6/6] chore: update example to wait for trainjob running status * revert change to e2e-run-notebook.sh Signed-off-by: solanyn <14799876+solanyn@users.noreply.github.com> --- .../fine-tune-distilbert.ipynb | 132 ++++++++++-------- hack/e2e-run-notebook.sh | 2 +- 2 files changed, 72 insertions(+), 62 deletions(-) diff --git a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb index 0c1e798d76..a0af152be9 100644 --- a/examples/pytorch/question-answering/fine-tune-distilbert.ipynb +++ b/examples/pytorch/question-answering/fine-tune-distilbert.ipynb @@ -59,29 +59,20 @@ "Requirement already satisfied: sympy>=1.13.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (1.14.0)\n", "Requirement already satisfied: networkx in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.4.2)\n", "Requirement already satisfied: jinja2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from torch>=2.0->transformers[torch]) (3.1.6)\n", - "Collecting azure-storage-blob>=12 (from cloudpathlib[all])\n", - " Using cached azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)\n", - "Collecting azure-storage-file-datalake>=12 (from cloudpathlib[all])\n", - " Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl.metadata (16 kB)\n", + "Requirement already satisfied: azure-storage-blob>=12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (12.25.1)\n", + "Requirement already satisfied: azure-storage-file-datalake>=12 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (12.20.0)\n", "Requirement already satisfied: google-cloud-storage in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (3.1.0)\n", - "Collecting boto3>=1.34.0 (from cloudpathlib[all])\n", - " Using cached boto3-1.38.3-py3-none-any.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: boto3>=1.34.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cloudpathlib[all]) (1.38.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2.4.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from requests->transformers[torch]) (2025.4.26)\n", - "Collecting azure-core>=1.30.0 (from azure-storage-blob>=12->cloudpathlib[all])\n", - " Using cached azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)\n", - "Collecting cryptography>=2.1.4 (from azure-storage-blob>=12->cloudpathlib[all])\n", - " Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)\n", - "Collecting isodate>=0.6.1 (from azure-storage-blob>=12->cloudpathlib[all])\n", - " Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)\n", - "Collecting botocore<1.39.0,>=1.38.3 (from boto3>=1.34.0->cloudpathlib[all])\n", - " Using cached botocore-1.38.3-py3-none-any.whl.metadata (5.7 kB)\n", - "Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.34.0->cloudpathlib[all])\n", - " Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n", - "Collecting s3transfer<0.13.0,>=0.12.0 (from boto3>=1.34.0->cloudpathlib[all])\n", - " Using cached s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)\n", + "Requirement already satisfied: azure-core>=1.30.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (1.33.0)\n", + "Requirement already satisfied: cryptography>=2.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (44.0.2)\n", + "Requirement already satisfied: isodate>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from azure-storage-blob>=12->cloudpathlib[all]) (0.7.2)\n", + "Requirement already satisfied: botocore<1.39.0,>=1.38.3 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (1.38.3)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.13.0,>=0.12.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from boto3>=1.34.0->cloudpathlib[all]) (0.12.0)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from sympy>=1.13.3->torch>=2.0->transformers[torch]) (1.3.0)\n", "Requirement already satisfied: google-auth<3.0dev,>=2.26.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.39.0)\n", "Requirement already satisfied: google-api-core<3.0.0dev,>=2.15.0 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-cloud-storage->cloudpathlib[all]) (2.25.0rc0)\n", @@ -100,17 +91,6 @@ "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (4.9.1)\n", "Requirement already satisfied: pycparser in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from cffi>=1.12->cryptography>=2.1.4->azure-storage-blob>=12->cloudpathlib[all]) (2.22)\n", "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /Users/andrew/git/trainer/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->cloudpathlib[all]) (0.6.1)\n", - "Using cached azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)\n", - "Using cached azure_storage_file_datalake-12.20.0-py3-none-any.whl (263 kB)\n", - "Using cached boto3-1.38.3-py3-none-any.whl (139 kB)\n", - "Using cached azure_core-1.33.0-py3-none-any.whl (207 kB)\n", - "Using cached botocore-1.38.3-py3-none-any.whl (13.5 MB)\n", - "Using cached cryptography-44.0.2-cp39-abi3-macosx_10_9_universal2.whl (6.7 MB)\n", - "Using cached isodate-0.7.2-py3-none-any.whl (22 kB)\n", - "Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)\n", - "Using cached s3transfer-0.12.0-py3-none-any.whl (84 kB)\n", - "Installing collected packages: jmespath, isodate, cryptography, botocore, azure-core, s3transfer, azure-storage-blob, boto3, azure-storage-file-datalake\n", - "Successfully installed azure-core-1.33.0 azure-storage-blob-12.25.1 azure-storage-file-datalake-12.20.0 boto3-1.38.3 botocore-1.38.3 cryptography-44.0.2 isodate-0.7.2 jmespath-1.0.1 s3transfer-0.12.0\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" @@ -328,7 +308,7 @@ { "data": { "text/plain": [ - "'hb18b5a7847c'" + "'rafd89de924b'" ] }, "execution_count": 5, @@ -361,7 +341,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "TrainJob: hb18b5a7847c, Status: Created, Created at: 2025-04-28 10:47:54+00:00\n" + "TrainJob: rafd89de924b, Status: Unknown, Created at: 2025-04-29 01:22:14+00:00\n" ] } ], @@ -373,6 +353,36 @@ { "cell_type": "code", "execution_count": 7, + "id": "c9f8ef95-b309-4987-9abb-760dc9c1e050", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for TrainJob running status. Sleep for 5 seconds\n" + ] + } + ], + "source": [ + "# TODO (andreyvelich): Use wait_for_job_status API from TrainerClient() when it is implemented.\n", + "import time\n", + "\n", + "def wait_for_job_running():\n", + " for _ in range(100):\n", + " trainjob = TrainerClient().get_job(name=job_id)\n", + " for c in trainjob.steps:\n", + " if c.status == \"Running\":\n", + " return\n", + " print(\"Waiting for TrainJob running status. Sleep for 5 seconds\")\n", + " time.sleep(5)\n", + "\n", + "wait_for_job_running()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "id": "a3eec801", "metadata": {}, "outputs": [ @@ -402,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "d31f102f-8583-42c4-a6f7-f5a5eb0e7f98", "metadata": {}, "outputs": [ @@ -411,41 +421,41 @@ "output_type": "stream", "text": [ "[node-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\n", - "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] \n", - "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", - "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", - "[node-0]: W0428 10:48:15.864000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", - "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n", + "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] \n", + "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", + "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", + "[node-0]: W0429 01:22:31.907000 1 site-packages/torch/distributed/run.py:793] *****************************************\n", "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 1, LOCAL_RANK: 1.\n", - "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 178019.36 examples/s]\n", - "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 277318.46 examples/s]\n", - "Map: 100%|██████████| 80/80 [00:00<00:00, 1035.67 examples/s]\n", - "Map: 100%|██████████| 20/20 [00:00<00:00, 2118.71 examples/s]\n", - "Map: 100%|██████████| 80/80 [00:00<00:00, 794.58 examples/s]\n", - "Map: 100%|██████████| 20/20 [00:00<00:00, 2195.22 examples/s]\n", + "[node-0]: Distributed Training with WORLD_SIZE: 2, RANK: 0, LOCAL_RANK: 0.\n", + "Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 627416.25 examples/s]\n", + "Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 1041555.11 examples/s]\n", + "Map: 100%|██████████| 80/80 [00:00<00:00, 1361.10 examples/s]\n", + "Map: 100%|██████████| 20/20 [00:00<00:00, 2123.27 examples/s]\n", + "Map: 100%|██████████| 80/80 [00:00<00:00, 1030.86 examples/s]\n", + "Map: 100%|██████████| 20/20 [00:00<00:00, 1877.53 examples/s]\n", "[node-0]: Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n", "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n", "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "[node-0]: Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n", "[node-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - " 0%| | 0/40 [00:00