[doc] add examples and minor updates #1071

Merged: 15 commits, Mar 24, 2025
1 change: 1 addition & 0 deletions doc/source/index.rst
@@ -71,6 +71,7 @@ Documentation

notebooks/datapipeline
notebooks/dataset_gsm8k_sft
notebooks/models/load_model

.. toctree::
:maxdepth: 1
28 changes: 17 additions & 11 deletions doc/source/notebooks/datapipeline.ipynb
@@ -79,9 +79,7 @@
"pipeline2 = read_sequence([5, 6, 7, 8]).and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for i in pipeline:\n",
" print(i)\n",
@@ -103,9 +101,7 @@
"pipeline2 = DataPipeline.constant(0).and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for _ in range(2):\n",
" assert list(pipeline) == [1, 0, 0, 2, 0, 2, 3, 0, 4, 4, 0, 6]\n",
@@ -124,9 +120,7 @@
"pipeline2 = read_sequence([0]).repeat().and_return()\n",
"pipeline3 = read_sequence([0, 2, 4, 6]).and_return()\n",
"\n",
"pipeline = DataPipeline.round_robin(\n",
" [pipeline1, pipeline2, pipeline3]\n",
").and_return()\n",
"pipeline = DataPipeline.round_robin([pipeline1, pipeline2, pipeline3]).and_return()\n",
"\n",
"for _ in range(2):\n",
" it = iter(pipeline)\n",
@@ -582,7 +576,9 @@
"\n",
"pipeline = (\n",
" read_sequence(seq)\n",
" .dynamic_bucket(threshold, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True)\n",
" .dynamic_bucket(\n",
" threshold, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True\n",
" )\n",
" .and_return()\n",
")\n",
"\n",
@@ -631,6 +627,7 @@
"def fn(d: int) -> int:\n",
" return d**2\n",
"\n",
"\n",
"seq = list(range(1, 10))\n",
"\n",
"pipeline = read_sequence(seq).map(fn, num_parallel_calls=4).and_return() # fmt: skip\n",
@@ -653,9 +650,11 @@
"\n",
"fn1 = StrToIntConverter()\n",
"\n",
"\n",
"def fn2(d: int) -> int:\n",
" return d**2\n",
"\n",
"\n",
"pipeline = read_sequence([\"1\", \"2\", \"3\", \"4\"]).map([fn1, fn2]).and_return()\n",
"\n",
"for _ in range(2):\n",
@@ -673,15 +672,18 @@
"# a bit more complex example with a dataclass\n",
"from dataclasses import dataclass\n",
"\n",
"\n",
"@dataclass\n",
"class Foo:\n",
" value: int\n",
"\n",
"\n",
"def fn(d: Foo) -> Foo:\n",
" d.value += 2\n",
"\n",
" return d\n",
"\n",
"\n",
"pipeline = read_sequence([Foo(1), Foo(2)]).map(fn).and_return()\n",
"\n",
"it = iter(pipeline)\n",
@@ -703,9 +705,11 @@
"def fn1(d: int) -> int:\n",
" return d + 10\n",
"\n",
"\n",
"def fn2(d: int) -> int:\n",
" return d * 2\n",
"\n",
"\n",
"seq = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n",
"\n",
"pipeline = read_sequence(seq).map([fn1, fn2], selector=\"[1]\").and_return()\n",
@@ -747,9 +751,11 @@
"\n",
"import copy\n",
"\n",
"\n",
"def fn(d: int) -> int:\n",
" return d + 10\n",
"\n",
"\n",
"d1 = {\n",
" \"foo1\": 1,\n",
" \"foo2\": [2, 3, {\"foo4\": 4}],\n",
@@ -853,7 +859,7 @@
"# Expected to be EOD.\n",
"pipeline.load_state_dict(state_dict)\n",
"\n",
"try: \n",
"try:\n",
" # this should raise StopIteration\n",
" next(iter(pipeline))\n",
"except StopIteration:\n",
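For readers skimming the diff, the cells above exercise three `fairseq2.data` pipeline patterns: round-robin over pipelines of unequal length, dynamic bucketing driven by a cost function, and parallel `map`. Below is a minimal, self-contained sketch of those patterns, assuming `read_sequence` and `DataPipeline` are importable from `fairseq2.data` as in the notebook; the concrete sequences and the threshold value are illustrative only.

```python
# A minimal sketch of the patterns the updated notebook demonstrates.
# Assumption: read_sequence and DataPipeline come from fairseq2.data, as in
# the notebook cells above; all values here are illustrative.
from fairseq2.data import DataPipeline, read_sequence


def cost_fn(d: int) -> int:
    # Hypothetical cost function: the cost of an element is its value.
    return d


def square(d: int) -> int:
    return d**2


# Round-robin over pipelines of unequal length.
pipeline1 = read_sequence([1, 2, 3, 4]).and_return()
pipeline2 = read_sequence([5, 6]).and_return()
pipeline = DataPipeline.round_robin([pipeline1, pipeline2]).and_return()
print(list(pipeline))

# Dynamic bucketing: accumulate elements until the total cost reaches the threshold,
# with hard bounds on the number of examples per bucket.
bucketed = (
    read_sequence(list(range(1, 10)))
    .dynamic_bucket(5, cost_fn, min_num_examples=2, max_num_examples=2, drop_remainder=True)
    .and_return()
)
print(list(bucketed))

# Parallel map: apply the mapping function with several calls in flight.
squared = read_sequence(list(range(1, 10))).map(square, num_parallel_calls=4).and_return()
print(list(squared))
```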
86 changes: 48 additions & 38 deletions doc/source/notebooks/dataset_gsm8k_sft.ipynb
@@ -34,7 +34,7 @@
" load_text_tokenizer,\n",
" setup_gangs,\n",
")\n",
"from fairseq2.recipes.config import GangSection\n",
"from fairseq2.recipes.config import DatasetSection, GangSection, ModelSection\n",
"from fairseq2.recipes.lm import InstructionFinetuneDatasetSection\n",
"from fairseq2.datasets.instruction import (\n",
" InstructionDataset,\n",
@@ -49,7 +49,21 @@
"## Initialization\n",
"\n",
"We first need to initialize fairseq2 -- `setup_fairseq2()`.\n",
"This will load the configuration and register the assets, which allows us to interact with pre-defined datasets and models."
"This will load the configuration and register the assets, which allows us to interact with pre-defined datasets and models.\n",
"\n",
"> Prerequisite: Follow the [HuggingFace Datasets Tutorial](https://huggingface.co/docs/hub/en/datasets-downloading) to download the [gsm8k data](https://huggingface.co/datasets/facebook/fairseq2-lm-gsm8k) (formatted with fairseq2 flavor) to your local path (_e.g._ `/datasets/facebook/fairseq2-lm-gsm8k/`).\n",
"\n",
"<details>\n",
"<summary>[1 example datapoint in the sft jsonl]</summary>\n",
"\n",
"```json\n",
"{\n",
" \"src\": \"<|start_header_id|>user<|end_header_id|>\\n\\nBrittany got a 78 on her first test. After her second test, her average rose to an 81. What grade did she get on her second test?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n",
" \"tgt\": \"First multiply her average grade by the number of tests she took to find the total number of points she scored: 81 points * 2 = <<81*2=162>>162 points\\nThen subtract the number of points she scored on her first exam to find how many points she scored on her second exam: 162 points - 78 points = <<162-78=84>>84 points\\n#### 84\"\n",
"}\n",
"```\n",
"\n",
"</details>"
]
},
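To sanity-check the downloaded data before running the notebook, something like the sketch below can be used. The jsonl file name under `sft/` is an assumption, and the download command in the comments is only one way to fetch the dataset (per the HuggingFace tutorial linked above).

```python
# Sketch: inspect one datapoint of the gsm8k SFT split downloaded to the
# example path from the prerequisite note.
# One way to fetch the data:
#   huggingface-cli download facebook/fairseq2-lm-gsm8k --repo-type dataset \
#       --local-dir /datasets/facebook/fairseq2-lm-gsm8k
import json
from pathlib import Path

sft_dir = Path("/datasets/facebook/fairseq2-lm-gsm8k/sft")

# Assumes at least one .jsonl file exists in the split directory.
first_file = next(sft_dir.glob("*.jsonl"))
with first_file.open() as fp:
    example = json.loads(fp.readline())

print(example["src"])  # prompt wrapped in the Llama 3 chat template
print(example["tgt"])  # reference solution ending in "#### <answer>"
```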
{
@@ -64,10 +78,9 @@
"context = get_runtime_context()\n",
"\n",
"# Load the configuration\n",
"dataset_config = InstructionFinetuneDatasetSection()\n",
"\n",
"dataset_config.name = \"gsm8k_sft\"\n",
"dataset_config.path = Path(\"/path/to/gsm8k_data/sft\")"
"dataset_config = InstructionFinetuneDatasetSection(\n",
" name=\"gsm8k_sft\", path=Path(\"/datasets/facebook/fairseq2-lm-gsm8k/sft\")\n",
")"
]
},
{
@@ -81,21 +94,39 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# prepare the seed\n",
"seed = 42\n",
"\n",
"# prepare the gang\n",
"gangs = setup_gangs(context, GangSection(tensor_parallel_size=5))\n",
"dataset = load_dataset(InstructionDataset, context, dataset_config, gangs)"
"\n",
"class Config(object):\n",
" \"\"\"\n",
" A configuration object for the dataset and model.\n",
" \"\"\"\n",
"\n",
" def __init__(self, gang: GangSection, dataset: DatasetSection, model: ModelSection):\n",
" self.gang = gang\n",
" self.dataset = dataset\n",
" self.model = model\n",
"\n",
"\n",
"config = Config(\n",
" gang=GangSection(tensor_parallel_size=1),\n",
" dataset=dataset_config,\n",
" model=ModelSection(name=\"llama3_1_8b\"),\n",
")\n",
"gangs = setup_gangs(context, config)\n",
"dataset = load_dataset(InstructionDataset, context, config, gangs)\n",
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, config)"
]
},
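The hand-written `Config` wrapper in the cell above only has to expose `gang`, `dataset`, and `model` attributes. Assuming that is all `setup_gangs`, `load_dataset`, and `load_text_tokenizer` read from it (as the cell suggests), a dataclass is an equivalent, slightly tidier option:

```python
# Equivalent sketch using a dataclass instead of the hand-written class.
# Assumption: setup_gangs / load_dataset / load_text_tokenizer only read the
# gang, dataset, and model sections off the config object, as in the cell above.
from dataclasses import dataclass

from fairseq2.recipes.config import DatasetSection, GangSection, ModelSection


@dataclass
class RecipeConfig:
    gang: GangSection
    dataset: DatasetSection
    model: ModelSection


config = RecipeConfig(
    gang=GangSection(tensor_parallel_size=1),
    dataset=dataset_config,
    model=ModelSection(name="llama3_1_8b"),
)
```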
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -119,16 +150,6 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the tokenizer\n",
"tokenizer = load_text_tokenizer(context, \"llama3_1_8b\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -141,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -166,27 +187,28 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"===batch_nr===0===\n",
"SequenceBatch(seqs=tensor([[128000, 128006, 882, ..., 220, 10132, 128009],\n",
"SequenceBatch(seqs=tensor([[128000, 128006, 882, ..., 220, 10132, 128001],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" ...,\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0]]), padding_mask=<fairseq2.nn.padding.PaddingMask object at 0x7f6f630faf20>, target_mask=tensor([[False, False, False, ..., True, True, True],\n",
" [128000, 128006, 882, ..., 0, 0, 0]],\n",
" device='cuda:0'), padding_mask=<fairseq2.nn.padding.PaddingMask object at 0x78220946d270>, target_mask=tensor([[False, False, False, ..., True, True, True],\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False],\n",
" ...,\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False],\n",
" [False, False, False, ..., False, False, False]]), example={'id': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], 'indices': {'is_ragged': True, 'seqs': tensor([[128000, 128006, 882, ..., 220, 10132, 128009],\n",
" [False, False, False, ..., False, False, False]], device='cuda:0'), example={'id': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], 'indices': {'is_ragged': True, 'seqs': tensor([[128000, 128006, 882, ..., 220, 10132, 128001],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" [128000, 128006, 882, ..., 0, 0, 0],\n",
" ...,\n",
@@ -231,18 +253,6 @@
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,