fix: Colab notebook clones repo instead of pip install — benchmarks need repo files

RBKunnela · RBKunnela · commit df8a987848d8 · 2026-04-14T13:45:59.000+03:00
diff --git a/benchmarks/colab_benchmark.ipynb b/benchmarks/colab_benchmark.ipynb
@@ -51,46 +51,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Install ALMA from GitHub (latest main branch)\n",
-    "!pip install -q git+https://github.com/RBKunnela/ALMA-memory.git\n",
-    "\n",
-    "# Install GPU-accelerated FAISS if available, fall back to CPU\n",
-    "import subprocess\n",
-    "import sys\n",
-    "\n",
-    "try:\n",
-    "    subprocess.check_call(\n",
-    "        [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"faiss-gpu\"],\n",
-    "        stdout=subprocess.DEVNULL,\n",
-    "        stderr=subprocess.DEVNULL,\n",
-    "    )\n",
-    "    print(\"Installed: faiss-gpu\")\n",
-    "except subprocess.CalledProcessError:\n",
-    "    subprocess.check_call(\n",
-    "        [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"faiss-cpu\"],\n",
-    "        stdout=subprocess.DEVNULL,\n",
-    "        stderr=subprocess.DEVNULL,\n",
-    "    )\n",
-    "    print(\"Installed: faiss-cpu (GPU not available)\")\n",
-    "\n",
-    "# Install visualization and embedding dependencies\n",
-    "!pip install -q sentence-transformers matplotlib seaborn\n",
-    "\n",
-    "# Verify GPU\n",
-    "import torch\n",
-    "print(f\"\\nPyTorch version: {torch.__version__}\")\n",
-    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-    "if torch.cuda.is_available():\n",
-    "    print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
-    "    print(f\"GPU memory: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB\")\n",
-    "else:\n",
-    "    print(\"Running on CPU -- embeddings will be slower but benchmarks still work.\")\n",
-    "\n",
-    "# Verify ALMA installation\n",
-    "import alma\n",
-    "print(f\"\\nALMA version: {alma.__version__}\")"
-   ]
+   "source": "# Clone the ALMA repo (benchmarks are not part of the pip package)\nimport os\n\nif not os.path.exists(\"/content/ALMA-memory\"):\n    !git clone https://github.com/RBKunnela/ALMA-memory.git /content/ALMA-memory\n    print(\"Cloned ALMA repo.\")\nelse:\n    !cd /content/ALMA-memory && git pull\n    print(\"Updated ALMA repo.\")\n\n# Install ALMA and dependencies from the cloned repo\nos.chdir(\"/content/ALMA-memory\")\n!pip install -q -e .\n\n# Install GPU-accelerated FAISS if available, fall back to CPU\nimport subprocess\nimport sys\n\ntry:\n    subprocess.check_call(\n        [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"faiss-gpu\"],\n        stdout=subprocess.DEVNULL,\n        stderr=subprocess.DEVNULL,\n    )\n    print(\"Installed: faiss-gpu\")\nexcept subprocess.CalledProcessError:\n    subprocess.check_call(\n        [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"faiss-cpu\"],\n        stdout=subprocess.DEVNULL,\n        stderr=subprocess.DEVNULL,\n    )\n    print(\"Installed: faiss-cpu (GPU not available)\")\n\n# Install visualization and embedding dependencies\n!pip install -q sentence-transformers matplotlib seaborn\n\n# Verify GPU\nimport torch\nprint(f\"\\nPyTorch version: {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n    # The device-properties field is total_memory (in bytes); total_mem does not exist\n    print(f\"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\nelse:\n    print(\"Running on CPU -- embeddings will be slower but benchmarks still work.\")\n\n# Verify ALMA installation\nimport alma\nprint(f\"\\nALMA version: {alma.__version__}\")"
   },
   {
    "cell_type": "markdown",
@@ -110,32 +71,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "import os\n",
-    "import json\n",
-    "\n",
-    "DATA_DIR = \"/tmp/alma-benchmark-data\"\n",
-    "DATA_FILE = os.path.join(DATA_DIR, \"longmemeval_s_cleaned.json\")\n",
-    "RESULTS_DIR = \"/tmp/alma-benchmark-results\"\n",
-    "\n",
-    "os.makedirs(DATA_DIR, exist_ok=True)\n",
-    "os.makedirs(RESULTS_DIR, exist_ok=True)\n",
-    "\n",
-    "if not os.path.exists(DATA_FILE):\n",
-    "    print(\"Downloading LongMemEval dataset from HuggingFace...\")\n",
-    "    !curl -fsSL -o {DATA_FILE} \\\n",
-    "      https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json\n",
-    "    print(\"Download complete.\")\n",
-    "else:\n",
-    "    print(\"Dataset already downloaded.\")\n",
-    "\n",
-    "# Quick sanity check\n",
-    "with open(DATA_FILE) as f:\n",
-    "    data = json.load(f)\n",
-    "print(f\"Questions loaded: {len(data)}\")\n",
-    "print(f\"File size: {os.path.getsize(DATA_FILE) / 1e6:.1f} MB\")\n",
-    "print(f\"Sample question: {data[0].get('question', data[0].get('query', 'N/A'))[:100]}...\")"
-   ]
+   "source": "import os\nimport json\n\n# Ensure we're in the repo directory\nos.chdir(\"/content/ALMA-memory\")\n\nDATA_DIR = \"/tmp/alma-benchmark-data\"\nDATA_FILE = os.path.join(DATA_DIR, \"longmemeval_s_cleaned.json\")\nRESULTS_DIR = \"/tmp/alma-benchmark-results\"\nDATA_URL = \"https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json\"\n# curl writes here first; the file is renamed to DATA_FILE only after curl succeeds\nPARTIAL_FILE = DATA_FILE + \".part\"\n\nos.makedirs(DATA_DIR, exist_ok=True)\nos.makedirs(RESULTS_DIR, exist_ok=True)\n\nif not os.path.exists(DATA_FILE):\n    print(\"Downloading LongMemEval dataset from HuggingFace...\")\n    # Download under a temporary name so a failed or interrupted transfer never leaves\n    # a truncated DATA_FILE that a re-run would report as already downloaded.\n    !curl -fsSL -o {PARTIAL_FILE} {DATA_URL} && mv {PARTIAL_FILE} {DATA_FILE}\n    if not os.path.exists(DATA_FILE):\n        raise RuntimeError(f\"Dataset download failed: {DATA_URL}\")\n    print(\"Download complete.\")\nelse:\n    print(\"Dataset already downloaded.\")\n\n# Quick sanity check\nwith open(DATA_FILE) as f:\n    data = json.load(f)\nprint(f\"Questions loaded: {len(data)}\")\nprint(f\"File size: {os.path.getsize(DATA_FILE) / 1e6:.1f} MB\")\nprint(f\"Sample question: {data[0].get('question', data[0].get('query', 'N/A'))[:100]}...\")"
   },
   {
    "cell_type": "markdown",