|
544 | 544 | "profiler": "xplane", |
545 | 545 | "dataset_path": "gs://max-datasets-rogue", |
546 | 546 | "dataset_type": "tfds", |
547 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.llama2"), |
| 547 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.llama2"), |
548 | 548 | "sa_block_q": 1024, |
549 | 549 | "sa_block_q_dkv": 2048, |
550 | 550 | "sa_block_q_dq": 2048, |
|
1280 | 1280 | "skip_first_n_steps_for_profiler": 10, |
1281 | 1281 | "profiler_steps": 5, |
1282 | 1282 | "tokenizer_type": "tiktoken", |
1283 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer_llama3.tiktoken"), |
| 1283 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer_llama3.tiktoken"), |
1284 | 1284 | }, |
1285 | 1285 | xla_flags=( |
1286 | 1286 | xla_flags_library.DENSE_VMEM_LIMIT_FLAG |
|
1336 | 1336 | "skip_first_n_steps_for_profiler": 10, |
1337 | 1337 | "profiler_steps": 5, |
1338 | 1338 | "tokenizer_type": "tiktoken", |
1339 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer_llama3.tiktoken"), |
| 1339 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer_llama3.tiktoken"), |
1340 | 1340 | }, |
1341 | 1341 | xla_flags=( |
1342 | 1342 | xla_flags_library.DENSE_VMEM_LIMIT_FLAG |
|
1517 | 1517 | "megablox": False, |
1518 | 1518 | "sparse_matmul": False, |
1519 | 1519 | "capacity_factor": 1.25, |
1520 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.mistral-v1"), |
| 1520 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.mistral-v1"), |
1521 | 1521 | }, |
1522 | 1522 | xla_flags=( |
1523 | 1523 | xla_flags_library.MOE_VMEM_LIMIT_FLAG |
|
1552 | 1552 | "sparse_matmul": False, |
1553 | 1553 | "capacity_factor": 1.25, |
1554 | 1554 | "quantization": "int8", |
1555 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.mistral-v1"), |
| 1555 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.mistral-v1"), |
1556 | 1556 | }, |
1557 | 1557 | xla_flags=( |
1558 | 1558 | xla_flags_library.MOE_VMEM_LIMIT_FLAG |
|
1593 | 1593 | "megablox": False, |
1594 | 1594 | "sparse_matmul": False, |
1595 | 1595 | "capacity_factor": 1.25, |
1596 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.mistral-v3"), |
| 1596 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.mistral-v3"), |
1597 | 1597 | "dtype": "bfloat16", |
1598 | 1598 | "weight_dtype": "bfloat16", |
1599 | 1599 | "allow_split_physical_axes": True, |
|
1634 | 1634 | "megablox": False, |
1635 | 1635 | "sparse_matmul": False, |
1636 | 1636 | "capacity_factor": 1.0, |
1637 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.mistral-v3"), |
| 1637 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.mistral-v3"), |
1638 | 1638 | "dtype": "bfloat16", |
1639 | 1639 | "opt_type": "sgd", |
1640 | 1640 | "weight_dtype": "bfloat16", |
|
1667 | 1667 | "reuse_example_batch": 1, |
1668 | 1668 | "enable_checkpointing": False, |
1669 | 1669 | "profiler": "xplane", |
1670 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.llama2"), |
| 1670 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.llama2"), |
1671 | 1671 | "sa_block_q": 2048, |
1672 | 1672 | "sa_block_q_dkv": 2048, |
1673 | 1673 | "sa_block_q_dq": 2048, |
|
1700 | 1700 | "reuse_example_batch": 1, |
1701 | 1701 | "enable_checkpointing": False, |
1702 | 1702 | "profiler": "xplane", |
1703 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.llama2"), |
| 1703 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.llama2"), |
1704 | 1704 | "sa_block_q": 2048, |
1705 | 1705 | "sa_block_q_dkv": 2048, |
1706 | 1706 | "sa_block_q_dq": 2048, |
|
1739 | 1739 | "profiler": "xplane", |
1740 | 1740 | "skip_first_n_steps_for_profiler": 10, |
1741 | 1741 | "profiler_steps": 2, |
1742 | | - "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"), |
| 1742 | + "tokenizer_path": os.path.join("assets", "tokenizers", "tokenizer.gemma3"), |
1743 | 1743 | "sa_block_q": 1024, |
1744 | 1744 | "sa_block_kv": 1024, |
1745 | 1745 | "sa_block_kv_compute": 1024, |
|
1779 | 1779 | "profiler": "xplane", |
1780 | 1780 | "skip_first_n_steps_for_profiler": 10, |
1781 | 1781 | "profiler_steps": 2, |
1782 | | - "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"), |
| 1782 | + "tokenizer_path": os.path.join("assets", "tokenizers", "tokenizer.gemma3"), |
1783 | 1783 | "sa_block_q": 1024, |
1784 | 1784 | "sa_block_kv": 1024, |
1785 | 1785 | "sa_block_kv_compute": 1024, |
|
1819 | 1819 | "profiler": "xplane", |
1820 | 1820 | "skip_first_n_steps_for_profiler": 10, |
1821 | 1821 | "profiler_steps": 2, |
1822 | | - "tokenizer_path": os.path.join("assets", "tokenizer.gemma3"), |
| 1822 | + "tokenizer_path": os.path.join("assets", "tokenizers", "tokenizer.gemma3"), |
1823 | 1823 | "sa_block_q": 1024, |
1824 | 1824 | "sa_block_kv": 1024, |
1825 | 1825 | "sa_block_kv_compute": 1024, |
|
1868 | 1868 | "skip_first_n_steps_for_profiler": 10, |
1869 | 1869 | "profiler_steps": 5, |
1870 | 1870 | "tokenizer_type": "tiktoken", |
1871 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer_llama3.tiktoken"), |
| 1871 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer_llama3.tiktoken"), |
1872 | 1872 | "packing": False, |
1873 | 1873 | }, |
1874 | 1874 | xla_flags=( |
|
1933 | 1933 | "sa_use_fused_bwd_kernel": True, |
1934 | 1934 | "sparse_matmul": False, |
1935 | 1935 | "capacity_factor": 1.5, |
1936 | | - "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer.mistral-v1"), |
| 1936 | + "tokenizer_path": os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer.mistral-v1"), |
1937 | 1937 | "dtype": "bfloat16", |
1938 | 1938 | "weight_dtype": "bfloat16", |
1939 | 1939 | "opt_type": "sgd", |
|
0 commit comments