[doc] add examples and minor updates #1071

Merged: 15 commits, Mar 24, 2025
1 change: 1 addition & 0 deletions doc/source/index.rst
@@ -71,6 +71,7 @@ Documentation

notebooks/datapipeline
notebooks/dataset_gsm8k_sft
notebooks/models/load_model

.. toctree::
:maxdepth: 1
28 changes: 17 additions & 11 deletions doc/source/notebooks/datapipeline.ipynb
@@ -79,9 +79,7 @@
"pipeline2 = read_sequence([5, 6, 7, 8]).and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for i in pipeline:\n",
" print(i)\n",
@@ -103,9 +101,7 @@
"pipeline2 = DataPipeline.constant(0).and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for _ in range(2):\n",
" assert list(pipeline) == [1, 0, 0, 2, 0, 2, 3, 0, 4, 4, 0, 6]\n",
@@ -124,9 +120,7 @@
"pipeline2 = read_sequence([0]).repeat().and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for _ in range(2):\n",
" it = iter(pipeline)\n",
@@ -582,7 +576,9 @@
"\n",
"pipeline = (\n",
" read_sequence(seq)\n",
" .dynamic_bucket(threshold, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True)\n",
" .dynamic_bucket(\n",
" threshold, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True\n",
" )\n",
" .and_return()\n",
")\n",
"\n",
@@ -631,6 +627,7 @@
"def fn(d: int) -> int:\n",
" return d**2\n",
"\n",
"\n",
"seq = list(range(1, 10))\n",
"\n",
"pipeline = read_sequence(seq).map(fn, num_parallel_calls=4).and_return() # fmt: skip\n",
@@ -653,9 +650,11 @@
"\n",
"fn1 = StrToIntConverter()\n",
"\n",
"\n",
"def fn2(d: int) -> int:\n",
" return d**2\n",
"\n",
"\n",
"pipeline = read_sequence([\"1\", \"2\", \"3\", \"4\"]).map([fn1, fn2]).and_return()\n",
"\n",
"for _ in range(2):\n",
@@ -673,15 +672,18 @@
"# a bit more complex example with a dataclass\n",
"from dataclasses import dataclass\n",
"\n",
"\n",
"@dataclass\n",
"class Foo:\n",
" value: int\n",
"\n",
"\n",
"def fn(d: Foo) -> Foo:\n",
" d.value += 2\n",
"\n",
" return d\n",
"\n",
"\n",
"pipeline = read_sequence([Foo(1), Foo(2)]).map(fn).and_return()\n",
"\n",
"it = iter(pipeline)\n",
@@ -703,9 +705,11 @@
"def fn1(d: int) -> int:\n",
" return d + 10\n",
"\n",
"\n",
"def fn2(d: int) -> int:\n",
" return d * 2\n",
"\n",
"\n",
"seq = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
"\n",
"pipeline = read_sequence(seq).map([fn1, fn2], selector=\"[1]\").and_return()\n",
@@ -747,9 +751,11 @@
"\n",
"import copy\n",
"\n",
"\n",
"def fn(d: int) -> int:\n",
" return d + 10\n",
"\n",
"\n",
"d1 = {\n",
" \"foo1\": 1,\n",
" \"foo2\": [2, 3, {\"foo4\": 4}],\n",
@@ -853,7 +859,7 @@
"# Expected to be EOD.\n",
"pipeline.load_state_dict(state_dict)\n",
"\n",
"try: \n",
"try:\n",
" # this should raise StopIteration\n",
" next(iter(pipeline))\n",
"except StopIteration:\n",
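For readers skimming the diff, the cells above exercise three `fairseq2.data` pipeline patterns: round-robin over pipelines of unequal length, dynamic bucketing driven by a cost function, and parallel `map`. Below is a minimal, self-contained sketch of those patterns, assuming `read_sequence` and `DataPipeline` are importable from `fairseq2.data` as in the notebook; the concrete sequences and the threshold value are illustrative only.

```python
# A minimal sketch of the patterns the updated notebook demonstrates.
# Assumption: read_sequence and DataPipeline come from fairseq2.data, as in
# the notebook cells above; all values here are illustrative.
from fairseq2.data import DataPipeline, read_sequence


def cost_fn(d: int) -> int:
    # Hypothetical cost function: the cost of an element is its value.
    return d


def square(d: int) -> int:
    return d**2


# Round-robin over pipelines of unequal length.
pipeline1 = read_sequence([1, 2, 3, 4]).and_return()
pipeline2 = read_sequence([5, 6]).and_return()
pipeline = DataPipeline.round_robin([pipeline1, pipeline2]).and_return()
print(list(pipeline))

# Dynamic bucketing: accumulate elements until the total cost reaches the threshold,
# with hard bounds on the number of examples per bucket.
bucketed = (
    read_sequence(list(range(1, 10)))
    .dynamic_bucket(5, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True)
    .and_return()
)
print(list(bucketed))

# Parallel map: apply the mapping function with several calls in flight.
squared = read_sequence(list(range(1, 10))).map(square, num_parallel_calls=4).and_return()
print(list(squared))
```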
86 changes: 48 additions & 38 deletions doc/source/notebooks/dataset_gsm8k_sft.ipynb
@@ -34,7 +34,7 @@
" load_text_tokenizer,\n",
" setup_gangs,\n",
")\n",
"from fairseq2.recipes.config import GangSection\n",
"from fairseq2.recipes.config import DatasetSection, GangSection, ModelSection\n",
"from fairseq2.recipes.lm import InstructionFinetuneDatasetSection\n",
"from fairseq2.datasets.instruction import (\n",
" InstructionDataset,\n",
@@ -49,7 +49,21 @@
"## Initialization\n",
"\n",
"We first need to initialize fairseq2 -- `setup_fairseq2()`.\n",
"This will load the configuration and register the assets, which allows us to interact with pre-defined datasets and models."
"This will load the configuration and register the assets, which allows us to interact with pre-defined datasets and models.\n",
"\n",
"> Prerequisite: Follow the [HuggingFace Datasets Tutorial](https://huggingface.co/docs/hub/en/datasets-downloading) to download the [gsm8k data](https://huggingface.co/datasets/facebook/fairseq2-lm-gsm8k) (formatted with fairseq2 flavor) to your local path (_e.g._ `/datasets/facebook/fairseq2-lm-gsm8k/`).\n",
"\n",
"<details>\n",
"<summary>[1 example datapoint in the sft jsonl]</summary>\n",
"\n",
"```json\n",
"{\n",
" \"src\": \"<|start_header_id|>user<|end_header_id|>\\n\\nBrittany got a 78 on her first test. After her second test, her average rose to an 81. What grade did she get on her second test?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n",
" \"tgt\": \"First multiply her average grade by the number of tests she took to find the total number of points she scored: 81 points * 2 = <<81*2=162>>162 points\\nThen subtract the number of points she scored on her first exam to find how many points she scored on her second exam: 162 points - 78 points = <<162-78=84>>84 points\\n#### 84\"\n",
"}\n",
"```\n",
"\n",
"</details>"
]
},
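To sanity-check the downloaded data before running the notebook, something like the sketch below can be used. The jsonl file name under `sft/` is an assumption, and the download command in the comments is only one way to fetch the dataset (per the HuggingFace tutorial linked above).

```python
# Sketch: inspect one datapoint of the gsm8k SFT split downloaded to the
# example path from the prerequisite note.
# One way to fetch the data:
#   huggingface-cli download facebook/fairseq2-lm-gsm8k --repo-type dataset \
#       --local-dir /datasets/facebook/fairseq2-lm-gsm8k
import json
from pathlib import Path

sft_dir = Path("/datasets/facebook/fairseq2-lm-gsm8k/sft")

# Assumes at least one .jsonl file exists in the split directory.
first_file = next(sft_dir.glob("*.jsonl"))
with first_file.open() as fp:
    example = json.loads(fp.readline())

print(example["src"])  # prompt wrapped in the Llama 3 chat template
print(example["tgt"])  # reference solution ending in "#### <answer>"
```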
{
@@ -64,10 +78,9 @@
"context = get_runtime_context()\n",
"\n",
"# Load the configuration\n",
"dataset_config = InstructionFinetuneDatasetSection()\n",
"\n",
"dataset_config.name = \"gsm8k_sft\"\n",
"dataset_config.path = Path(\"/path/to/gsm8k_data/sft\")"
"dataset_config = InstructionFinetuneDatasetSection(\n",
" name=\"gsm8k_sft\", path=Path(\"/datasets/facebook/fairseq2-lm-gsm8k/sft\")\n",
")"
]
},
{
@@ -81,21 +94,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# prepare the seed\n",
"seed = 42\n",
"\n",
"# prepare the gang\n",
"gangs = setup_gangs(context, GangSection(tensor_parallel_size=5))\n",
"dataset = load_dataset(InstructionDataset, context, dataset_config, gangs)"
"\n",
"class Config(object):\n",
" \"\"\"\n",
" A configuration object for the dataset and model.\n",
" \"\"\"\n",
"\n",
" def __init__(self, gang: GangSection, dataset: DatasetSection, model: ModelSection):\n",
" self.gang = gang\n",
" self.dataset = dataset\n",
" self.model = model\n",
"\n",
"\n",
"config = Config(\n",
" gang=GangSection(tensor_parallel_size=1),\n",
" dataset=dataset_config,\n",
" model=ModelSection(name=\"llama3_1_8b\"),\n",
")\n",
"gangs = setup_gangs(context, config)\n",
"dataset = load_dataset(InstructionDataset, context, config, gangs)\n",
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, config)"
]
},
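The hand-written `Config` wrapper in the cell above only has to expose `gang`, `dataset`, and `model` attributes. Assuming that is all `setup_gangs`, `load_dataset`, and `load_text_tokenizer` read from it (as the cell suggests), a dataclass is an equivalent, slightly tidier option:

```python
# Equivalent sketch using a dataclass instead of the hand-written class.
# Assumption: setup_gangs / load_dataset / load_text_tokenizer only read the
# gang, dataset, and model sections off the config object, as in the cell above.
from dataclasses import dataclass

from fairseq2.recipes.config import DatasetSection, GangSection, ModelSection


@dataclass
class RecipeConfig:
    gang: GangSection
    dataset: DatasetSection
    model: ModelSection


config = RecipeConfig(
    gang=GangSection(tensor_parallel_size=1),
    dataset=dataset_config,
    model=ModelSection(name="llama3_1_8b"),
)
```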
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -119,16 +150,6 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, \"llama3_1_8b\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -141,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -166,27 +187,28 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"===batch_nr===0===\n",
"SequenceBatch(seqs=tensor([[128000, 128006, 882, ..., 220, 10132, 128009],\n",
"SequenceBatch(seqs=tensor([[128000, 128006, 882, ..., 220, 10132, 128001],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" ...,\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0]]), padding_mask=<fairseq2.nn.padding.PaddingMask object at 0x7f6f630faf20>, target_mask=tensor([[False, False, False, ..., True, True, True],\n",
" [128000, 128006, 882, ..., 0, 0, 0]],\n",
" device='cuda:0'), padding_mask=<fairseq2.nn.padding.PaddingMask object at 0x78220946d270>, target_mask=tensor([[False, False, False, ..., True, True, True],\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False],\n",
" ...,\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False]]), example={'id': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], 'indices': {'is_ragged': True, 'seqs': tensor([[128000, 128006, 882, ..., 220, 10132, 128009],\n",
" [False, False, False, ..., False, False, False]], device='cuda:0'), example={'id': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], 'indices': {'is_ragged': True, 'seqs': tensor([[128000, 128006, 882, ..., 220, 10132, 128001],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" ...,\n",
@@ -231,18 +253,6 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,