diff --git a/notebooks/inference.ipynb b/notebooks/inference.ipynb index dbc84c2..59e44be 100644 --- a/notebooks/inference.ipynb +++ b/notebooks/inference.ipynb @@ -1,206 +1,430 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Alpamayo-R1 Demo" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Alpamayo 1 Demo" + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": {}, + "source": [ + "This notebook will load some example data from the NVIDIA [PhysicalAI-AV Dataset](https://huggingface.co/datasets/nvidia/PhysicalAI-Autonomous-Vehicles) and run the Alpamayo 1 model on it, producing and visualizing output trajectories and associated reasoning traces." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "import copy\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import torch\n", + "from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1\n", + "from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset\n", + "from alpamayo_r1 import helper" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "### Load model and construct data preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1199ecb4", + "metadata": {}, + "outputs": [], + "source": [ + "# check the gpu\n", + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "model = AlpamayoR1.from_pretrained(\"nvidia/Alpamayo-R1-10B\", dtype=torch.bfloat16).to(\"cuda\")\n", + "processor = helper.get_processor(model.tokenizer)" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "### Load and prepare data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "clip_ids = pd.read_parquet(\"clip_ids.parquet\")[\"clip_id\"].tolist()\n", + "clip_id = clip_ids[774]\n", + "# examples\n", + "# 774 clip_id = '030c760c-ae38-49aa-9ad8-f5650a545d26'\n", + "\n", + "data = load_physical_aiavdataset(clip_id)\n", + "\n", + "messages = helper.create_message(data[\"image_frames\"].flatten(0, 1))\n", + "\n", + "inputs = processor.apply_chat_template(\n", + " messages,\n", + " tokenize=True,\n", + " add_generation_prompt=False,\n", + " continue_final_message=True,\n", + " return_dict=True,\n", + " return_tensors=\"pt\",\n", + ")\n", + "print(\"seq length:\", inputs.input_ids.shape)\n", + "model_inputs = {\n", + " \"tokenized_data\": inputs,\n", + " \"ego_history_xyz\": data[\"ego_history_xyz\"],\n", + " \"ego_history_rot\": data[\"ego_history_rot\"],\n", + "}\n", + "model_inputs = helper.to_device(model_inputs, \"cuda\")" + ] + }, + { + "cell_type": "markdown", + "id": "e34c9783", + "metadata": {}, + "source": [ + "### Inspect input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0afd0b70", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"model_inputs:\")\n", + "# full input data\n", + "for key, value in model_inputs.items():\n", + " if isinstance(value, torch.Tensor):\n", + " print(key, value.shape)\n", + " else:\n", + " print(key, type(value))\n", + "\n", + "# data fed into model.vlm\n", + "print(\"\\nmodel_inputs['tokenized_data']:\")\n", + "for key, value in model_inputs[\"tokenized_data\"].items():\n", + " if isinstance(value, torch.Tensor):\n", + " print(key, value.shape)\n", + " else:\n", + " print(key, type(value))\n" + ] + }, + { + "cell_type": "markdown", + "id": "440734a7", + "metadata": {}, + "source": [ + "### Visualization of the ego history in BEV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a816bd61", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Extract ego history — (1, 1, T, 3) → (T, 3)\n", + "ego_xyz = model_inputs[\"ego_history_xyz\"].cpu().numpy().squeeze() # (T, 3)\n", + "\n", + "x, y = ego_xyz[:, 0], ego_xyz[:, 1] # ignore z\n", + "T = len(x)\n", + "\n", + "# BEV mapping: plot_x = -y (right), plot_y = x (forward up)\n", + "px, py = -y, x\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 8))\n", + "\n", + "# Trajectory line + markers\n", + "ax.plot(px, py, color=\"royalblue\", linewidth=2, alpha=0.6, zorder=2)\n", + "ax.scatter(px, py, c=np.arange(T), cmap=\"viridis\", s=30, zorder=3)\n", + "\n", + "# Start & End\n", + "ax.scatter(px[0], py[0], color=\"green\", s=100, marker=\"D\", zorder=4, label=\"Ego History Start\")\n", + "ax.scatter(px[-1], py[-1], color=\"red\", s=100, marker=\"D\", zorder=4, label=\"Ego History End (t0)\")\n", + "\n", + "ax.set_xlabel(\"Y (right) [m]\")\n", + "ax.set_ylabel(\"X (forward) [m]\")\n", + "ax.set_title(\"BEV Ego History\")\n", + "ax.set_aspect(\"equal\")\n", + "\n", + "# Make plot square\n", + "v_span = py.max() - py.min()\n", + "h_span = px.max() - px.min()\n", + "span = max(v_span, h_span) * 1.1 # 10% padding\n", + "h_mid = (px.max() + px.min()) / 2\n", + "v_mid = (py.max() + py.min()) / 2\n", + "ax.set_xlim(h_mid - span / 2, h_mid + span / 2)\n", + "ax.set_ylim(v_mid - span / 2, v_mid + span / 2)\n", + "\n", + "ax.legend()\n", + "\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad7c2dc7", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Camera names corresponding to the sorted camera indices\n", + "# Map camera indices to names (based on avdi.features.CAMERA enum)\n", + "CAMERA_INDEX_TO_NAME = {\n", + " 0: \"Cross Left\",\n", + " 1: \"Front Wide\",\n", + " 2: \"Cross Right\",\n", + " 6: \"Front Tele\",\n", + "}\n", + "frame_idx_to_time = [\n", + " f\"t={t - data['relative_timestamps'][-1].tolist()[-1]:.1f}s\"\n", + " for t in data[\"relative_timestamps\"][-1].tolist()\n", + "]\n", + "camera_names = [\n", + " CAMERA_INDEX_TO_NAME.get(idx.item(), f\"Camera {idx.item()}\") for idx in data[\"camera_indices\"]\n", + "]\n", + "\n", + "# image_frames shape: (N_cameras, num_frames, 3, H, W)\n", + "images = (\n", + " data[\"image_frames\"].permute(0, 1, 3, 4, 2).cpu().numpy()\n", + ") # (N_cameras, num_frames, H, W, 3)\n", + "n_cameras, n_frames = images.shape[:2]\n", + "\n", + "fig, axes = plt.subplots(n_cameras, n_frames, figsize=(12, 8))\n", + "plt.subplots_adjust(wspace=0.02, hspace=0.1, left=0.12, right=0.98, top=0.92, bottom=0.02)\n", + "\n", + "for cam_idx in range(n_cameras):\n", + " for frame_idx in range(n_frames):\n", + " ax = axes[cam_idx, frame_idx]\n", + " ax.imshow(images[cam_idx, frame_idx])\n", + " ax.axis(\"off\")\n", + "\n", + " # Add column labels (timesteps) at the top row\n", + " if cam_idx == 0:\n", + " ax.set_title(frame_idx_to_time[frame_idx], fontsize=11)\n", + "\n", + " # Add row labels (camera names) on the left column\n", + " if frame_idx == 0:\n", + " ax.text(\n", + " -0.05,\n", + " 0.5,\n", + " camera_names[cam_idx],\n", + " fontsize=10,\n", + " rotation=0,\n", + " ha=\"right\",\n", + " va=\"center\",\n", + " transform=ax.transAxes,\n", + " )\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52a76e4c", + "metadata": {}, + "outputs": [], + "source": [ + "# check the input sequence\n", + "input_ids = model_inputs[\"tokenized_data\"][\"input_ids\"]\n", + "tokenizer = model.tokenizer\n", + "input_seq = tokenizer.decode(input_ids[0])\n", + "\n", + "# replacing special tokens for easier reading\n", + "input_seq = input_seq.replace(\"<|image_pad|>\", \"I\")\n", + "input_seq = input_seq.replace(\"<|traj_history|>\", \"H\")\n", + "input_seq = input_seq.replace(\"<|traj_future|>\", \"F\")\n", + "input_seq = input_seq.replace(\"<|vision_end|>\", \"<|vision_end|>\\n\")\n", + "input_seq = input_seq.replace(\"<|traj_history_end|>\", \"<|traj_history_end|>\\n\")\n", + "\n", + "print(input_seq)\n" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "### Model inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "torch.cuda.manual_seed_all(42)\n", + "with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n", + " pred_xyz, pred_rot, extra = model.sample_trajectories_from_data_with_vlm_rollout(\n", + " data=copy.deepcopy(model_inputs),\n", + " top_p=0.98,\n", + " temperature=0.6,\n", + " num_traj_samples=4, # Feel free to raise this for more output trajectories and CoC traces.\n", + " max_generation_length=256,\n", + " return_extra=True,\n", + " )\n", + "\n", + "# the size is [batch_size, num_traj_sets, num_traj_samples]\n", + "print(\"Chain-of-Causation (per trajectory):\\n\", extra[\"cot\"][0])" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": {}, + "source": [ + "## Visualizing data and results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# --- Ego history (BEV) ---\n", + "hist_xyz = model_inputs[\"ego_history_xyz\"].cpu().numpy().squeeze() # (T_h, 3)\n", + "hx, hy = -hist_xyz[:, 1], hist_xyz[:, 0] # BEV: plot_x=-y, plot_y=x\n", + "\n", + "# --- GT future ---\n", + "gt_xyz = data[\"ego_future_xyz\"].cpu().numpy().squeeze() # (T_f, 3)\n", + "gx, gy = -gt_xyz[:, 1], gt_xyz[:, 0]\n", + "\n", + "# --- Predicted futures ---\n", + "n_samples = pred_xyz.shape[2]\n", + "pred_np = pred_xyz.cpu().numpy()[0, 0] # (n_samples, T_f, 3)\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 8))\n", + "\n", + "# Ego history\n", + "ax.plot(hx, hy, color=\"royalblue\", linewidth=2, alpha=0.6, zorder=2)\n", + "ax.scatter(hx, hy, c=np.arange(len(hx)), cmap=\"viridis\", s=30, zorder=3)\n", + "ax.scatter(hx[0], hy[0], color=\"green\", s=100, marker=\"D\", zorder=4, label=\"History Start\")\n", + "ax.scatter(hx[-1], hy[-1], color=\"red\", s=100, marker=\"D\", zorder=4, label=\"t0\")\n", + "\n", + "# Predicted trajectories\n", + "colors = plt.cm.tab10(np.linspace(0, 1, n_samples))\n", + "for i in range(n_samples):\n", + " px_pred = -pred_np[i, :, 1]\n", + " py_pred = pred_np[i, :, 0]\n", + " ax.plot(\n", + " px_pred,\n", + " py_pred,\n", + " \"o-\",\n", + " color=colors[i],\n", + " markersize=4,\n", + " linewidth=1.5,\n", + " alpha=0.8,\n", + " zorder=5,\n", + " label=f\"Pred #{i + 1}\",\n", + " )\n", + "\n", + "# GT future\n", + "ax.plot(gx, gy, \"s-\", color=\"red\", markersize=4, linewidth=2, zorder=6, label=\"GT Future\")\n", + "\n", + "ax.set_xlabel(\"Y (right) [m]\")\n", + "ax.set_ylabel(\"X (forward) [m]\")\n", + "ax.set_title(\"BEV: Ego History + Predicted & GT Trajectories\")\n", + "ax.set_aspect(\"equal\")\n", + "\n", + "# Make plot square\n", + "all_px = np.concatenate([hx, gx] + [-pred_np[i, :, 1] for i in range(n_samples)]) # horizontal\n", + "all_py = np.concatenate([hy, gy] + [pred_np[i, :, 0] for i in range(n_samples)]) # vertical\n", + "v_span = all_py.max() - all_py.min()\n", + "h_span = all_px.max() - all_px.min()\n", + "span = max(v_span, h_span) * 1.1 # 10% padding\n", + "h_mid = (all_px.max() + all_px.min()) / 2\n", + "v_mid = (all_py.max() + all_py.min()) / 2\n", + "ax.set_xlim(h_mid - span / 2, h_mid + span / 2)\n", + "ax.set_ylim(v_mid - span / 2, v_mid + span / 2)\n", + "\n", + "ax.legend()\n", + "ax.grid(True, alpha=0.3)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], + "source": [ + "pred_xy = pred_xyz.cpu().numpy()[0, 0, :, :, :2].transpose(0, 2, 1)\n", + "gt_xy = data[\"ego_future_xyz\"].cpu().numpy()[0, 0, :, :2].T\n", + "diff = np.linalg.norm(pred_xy - gt_xy[None, ...], axis=1).mean(-1)\n", + "print(\"minADE:\", diff.min(), \"meters\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fde18ba", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ar1_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } }, - { - "cell_type": "markdown", - "id": "1", - "metadata": {}, - "source": [ - "This notebook will load some example data from the NVIDIA [PhysicalAI-AV Dataset](https://huggingface.co/datasets/nvidia/PhysicalAI-Autonomous-Vehicles) and run the Alpamayo-R1 model on it, producing and visualizing output trajectories and associated reasoning traces." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "import copy\n", - "import numpy as np\n", - "import mediapy as mp\n", - "import pandas as pd\n", - "\n", - "import torch\n", - "from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1\n", - "from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset\n", - "from alpamayo_r1 import helper" - ] - }, - { - "cell_type": "markdown", - "id": "3", - "metadata": {}, - "source": [ - "### Load model and construct data preprocessor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "model = AlpamayoR1.from_pretrained(\"nvidia/Alpamayo-R1-10B\", dtype=torch.bfloat16).to(\"cuda\")\n", - "processor = helper.get_processor(model.tokenizer)" - ] - }, - { - "cell_type": "markdown", - "id": "5", - "metadata": {}, - "source": [ - "### Load and prepare data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "clip_ids = pd.read_parquet(\"clip_ids.parquet\")[\"clip_id\"].tolist()\n", - "clip_id = clip_ids[774]\n", - "# clip_id = '030c760c-ae38-49aa-9ad8-f5650a545d26'\n", - "\n", - "data = load_physical_aiavdataset(clip_id)\n", - "\n", - "messages = helper.create_message(data[\"image_frames\"].flatten(0, 1))\n", - "\n", - "inputs = processor.apply_chat_template(\n", - " messages,\n", - " tokenize=True,\n", - " add_generation_prompt=False,\n", - " continue_final_message=True,\n", - " return_dict=True,\n", - " return_tensors=\"pt\",\n", - ")\n", - "print(\"seq length:\", inputs.input_ids.shape)\n", - "model_inputs = {\n", - " \"tokenized_data\": inputs,\n", - " \"ego_history_xyz\": data[\"ego_history_xyz\"],\n", - " \"ego_history_rot\": data[\"ego_history_rot\"],\n", - "}\n", - "model_inputs = helper.to_device(model_inputs, \"cuda\")" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "### Model inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "torch.cuda.manual_seed_all(42)\n", - "with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n", - " pred_xyz, pred_rot, extra = model.sample_trajectories_from_data_with_vlm_rollout(\n", - " data=copy.deepcopy(model_inputs),\n", - " top_p=0.98,\n", - " temperature=0.6,\n", - " num_traj_samples=1, # Feel free to raise this for more output trajectories and CoC traces.\n", - " max_generation_length=256,\n", - " return_extra=True,\n", - " )\n", - "\n", - "# the size is [batch_size, num_traj_sets, num_traj_samples]\n", - "print(\"Chain-of-Causation (per trajectory):\\n\", extra[\"cot\"][0])" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "## Visualizing data and results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "mp.show_images(data[\"image_frames\"].flatten(0, 1).permute(0, 2, 3, 1), columns=4, width=200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "def rotate_90cc(xy):\n", - " # Rotate (x, y) by 90 deg CCW -> (y, -x)\n", - " return np.stack([-xy[1], xy[0]], axis=0)\n", - "\n", - "\n", - "for i in range(pred_xyz.shape[2]):\n", - " pred_xy = pred_xyz.cpu()[0, 0, i, :, :2].T.numpy()\n", - " pred_xy_rot = rotate_90cc(pred_xy)\n", - " gt_xy = data[\"ego_future_xyz\"].cpu()[0, 0, :, :2].T.numpy()\n", - " gt_xy_rot = rotate_90cc(gt_xy)\n", - " plt.plot(*pred_xy_rot, \"o-\", label=f\"Predicted Trajectory #{i + 1}\")\n", - "plt.ylabel(\"y coordinate (meters)\")\n", - "plt.xlabel(\"x coordinate (meters)\")\n", - "plt.plot(*gt_xy_rot, \"r-\", label=\"Ground Truth Trajectory\")\n", - "plt.legend(loc=\"best\")\n", - "plt.axis(\"equal\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "pred_xy = pred_xyz.cpu().numpy()[0, 0, :, :, :2].transpose(0, 2, 1)\n", - "diff = np.linalg.norm(pred_xy - gt_xy[None, ...], axis=1).mean(-1)\n", - "print(\"minADE:\", diff.min(), \"meters\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 }