diff --git a/Makefile b/Makefile
index a3d6f6691..1520ff4d5 100644
--- a/Makefile
+++ b/Makefile
@@ -9,8 +9,7 @@ TEST_IMAGE = $(shell beaker workspace images $(BEAKER_WORKSPACE) --format=json
 
 .PHONY : run-checks
 run-checks :
-	isort --check .
-	black --check .
+	ruff format --check .
 	ruff check .
 	mypy .
 	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes tests/
diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index ccea68096..6c7fbf4f3 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -242,9 +242,9 @@ def _instance_predictions_map_list(
         instance_id = guess_instance_id(instance, idx=idx)  # dict
 
         if keep_instance_fields or keep_all_instance_fields_except:
-            assert (
-                keep_instance_fields is None or keep_all_instance_fields_except is None
-            ), "Can't use both keep_instance_fields and keep_all_instance_fields_except"
+            assert keep_instance_fields is None or keep_all_instance_fields_except is None, (
+                "Can't use both keep_instance_fields and keep_all_instance_fields_except"
+            )
             for field in instance:
                 if keep_instance_fields and field not in keep_instance_fields:
                     continue
diff --git a/olmo/checkpoint.py b/olmo/checkpoint.py
index 9453a69cc..a7fcaf8f8 100644
--- a/olmo/checkpoint.py
+++ b/olmo/checkpoint.py
@@ -471,9 +471,9 @@ def read_data(self, plan: dist_cp.LoadPlan, planner: dist_cp.LoadPlanner) -> Fut
                 tensor = narrow_tensor_by_index(tensor, read_item.storage_offsets, read_item.lengths)
                 target_tensor = planner.resolve_tensor(read_item).detach()
 
-                assert (
-                    target_tensor.size() == tensor.size()
-                ), f"req {read_item.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}"
+                assert target_tensor.size() == tensor.size(), (
+                    f"req {read_item.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}"
+                )
                 target_tensor.copy_(tensor)
                 planner.commit_tensor(read_item, target_tensor)
 
@@ -903,9 +903,9 @@ def save_checkpoint(
         *,
         upload_to: Optional[str] = None,
     ) -> None:
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        )
         with self._temporary_wd(dir) as checkpoint_dir:
             # Save model and optim state.
             save_fsdp_model_and_optim_state(
@@ -940,9 +940,9 @@ def restore_checkpoint(
     ) -> Dict[str, Any]:
         # Load model and optimizer state in place.
         log.info("Loading model and optimizer state...")
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        )
 
         load_fsdp_model_and_optim_state(
             load_path,
@@ -987,9 +987,9 @@ def save_checkpoint(
         *,
         upload_to: Optional[str] = None,
     ) -> None:
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        )
         with self._temporary_wd(dir) as checkpoint_dir:
             with FSDP.state_dict_type(
                 dist_model,
@@ -1022,9 +1022,9 @@ def restore_checkpoint(
         local_cache: Optional[PathOrStr] = None,
         load_optimizer_state: bool = True,
     ) -> Dict[str, Any]:
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        )
         with FSDP.state_dict_type(
             dist_model,
             state_dict_type=StateDictType.SHARDED_STATE_DICT,
@@ -1587,9 +1587,9 @@ def save_checkpoint(
         *,
         upload_to: Optional[str] = None,
     ) -> None:
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to save a model where `distributed_strategy` is not FSDP."
+        )
 
         with self._temporary_wd(dir) as checkpoint_dir:
             # Gather local FSDP flat params data to save.
@@ -1648,9 +1648,9 @@ def restore_checkpoint(
 
         # Load local FSDP flat param data.
         log.info("Loading local FSDP flat params data...")
-        assert isinstance(
-            dist_model, FSDP
-        ), f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        assert isinstance(dist_model, FSDP), (
+            f"{self.__class__.__name__} is being called to load a model where `distributed_strategy` is not FSDP."
+        )
 
         model_state = load_state_dict(
             load_path, f"model/rank{get_global_rank()}.pt", local_cache=local_cache, map_location="cpu"
diff --git a/olmo/config.py b/olmo/config.py
index 6da7dc03d..ab325976d 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -650,13 +650,13 @@ class CustomDatasetCollatorConfig(BaseConfig):
 @dataclass
 class CustomDatasetConfig(BaseConfig):
     name: str  #: The name of the custom dataset class or function that will be used to load the dataset.
-    module: Optional[
-        str
-    ] = None  #: The module where the custom dataset class is defined. If not set, the module will be inferred from the class name.
+    module: Optional[str] = (
+        None  #: The module where the custom dataset class is defined. If not set, the module will be inferred from the class name.
+    )
     args: Optional[Dict[str, Any]] = None  #: The arguments to pass to the custom dataset class or function
-    collate_fn: Optional[
-        str
-    ] = None  #: The name of the collate function to use for the custom dataset. Assumes the collate function is defined in the same module as the custom dataset class unless specified otherwise using the full object path.
+    collate_fn: Optional[str] = (
+        None  #: The name of the collate function to use for the custom dataset. Assumes the collate function is defined in the same module as the custom dataset class unless specified otherwise using the full object path.
+    )
     token_field: Optional[str] = None  #: The field in the dataset items that contains the tokenized text.
     collate_config: Optional[CustomDatasetCollatorConfig] = field(
         default_factory=CustomDatasetCollatorConfig
diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py
index 94ae39b21..fb5d95ef2 100644
--- a/olmo/eval/downstream.py
+++ b/olmo/eval/downstream.py
@@ -1528,9 +1528,9 @@ def prep_examples(self):
                     continue
                 if doc_id > max_doc_id:
                     max_doc_id = doc_id
-                assert (
-                    request["request_type"] == "loglikelihood"
-                ), f"Unsupported request type: {request['request_type']}"
+                assert request["request_type"] == "loglikelihood", (
+                    f"Unsupported request type: {request['request_type']}"
+                )
 
             # from EAI harness
             # how this all works:
diff --git a/olmo/optim.py b/olmo/optim.py
index a8b2fc755..2ec46133c 100644
--- a/olmo/optim.py
+++ b/olmo/optim.py
@@ -393,9 +393,9 @@ def __init__(
     def get_post_step_metrics(
         self, module: nn.Module, process_group: Optional[dist.ProcessGroup] = None
    ) -> Dict[str, torch.Tensor]:
-        assert isinstance(
-            module, FSDP
-        ), "`get_post_step_metrics` expects module to be FSDP and will not work with other `distributed_strategy`."
+        assert isinstance(module, FSDP), (
+            "`get_post_step_metrics` expects module to be FSDP and will not work with other `distributed_strategy`."
+        )
 
         update_total_dot_prod = self._update_total_dot_prod
         update_total_norm = self._update_total_norm
@@ -792,6 +792,7 @@ def get_lr(self, initial_lr: float, step: int, max_steps: int) -> float:
 @dataclass
 class CosLinearEnvelope(Scheduler):
     "Pointwise product of cosine schedule and linear decay; useful during annealing."
+
     warmup_steps: int
     alpha_f: float = 0.1
     t_max: Optional[int] = None
@@ -874,9 +875,9 @@ def get_param_groups(cfg: TrainConfig, model: nn.Module) -> List[Dict[str, Any]]
     inter_params = decay & no_decay
     union_params = decay | no_decay
     assert len(inter_params) == 0, f"parameters {inter_params} made it into both decay/no_decay sets!"
-    assert (
-        len(all_params.keys() - union_params) == 0
-    ), f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!"
+    assert len(all_params.keys() - union_params) == 0, (
+        f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!"
+    )
 
     # Create the pytorch optimizer groups.
     decay_sorted = sorted(list(decay))
diff --git a/olmo/util.py b/olmo/util.py
index af163e524..85d7e41cc 100644
--- a/olmo/util.py
+++ b/olmo/util.py
@@ -512,7 +512,7 @@ def _gcs_find_latest_checkpoint(bucket_name: str, prefix: str) -> Optional[str]:
             or (step == latest_step and latest_checkpoint is not None and latest_checkpoint.endswith("-unsharded"))
         ):
             latest_step = step
-            latest_checkpoint = f"gs://{bucket_name}/{blob.name[:-len(suffix)]}"
+            latest_checkpoint = f"gs://{bucket_name}/{blob.name[: -len(suffix)]}"
 
     return latest_checkpoint
 
@@ -710,7 +710,7 @@ def _http_get_bytes_range(scheme: str, host_name: str, path: str, bytes_start: i
         try:
             response = requests.get(
                 f"{scheme}://{host_name}/{path}",
-                headers={"Range": f"bytes={bytes_start}-{bytes_start+num_bytes-1}"},
+                headers={"Range": f"bytes={bytes_start}-{bytes_start + num_bytes - 1}"},
             )
             result = response.content
             if len(result) == num_bytes:
@@ -719,7 +719,7 @@ def _http_get_bytes_range(scheme: str, host_name: str, path: str, bytes_start: i
                 log.warning(f"Expected {num_bytes} bytes, but got {len(result)}. Retrying...")
         except requests.exceptions.RequestException as e:
-            log.warning(f"Attempt {attempt+1}/{max_retries}. Network error: {e}. Retrying...")
+            log.warning(f"Attempt {attempt + 1}/{max_retries}. Network error: {e}. Retrying...")
         attempt += 1
         time.sleep(2**attempt)
 
     raise ValueError(
@@ -910,7 +910,7 @@ def get_resource(self, temp_file: io.BufferedWriter) -> None:
 
     def get_bytes_range(self, index: int, length: int) -> bytes:
         response = self.s3.get_object(
-            Bucket=self.bucket_name, Key=self.path, Range=f"bytes={index}-{index+length-1}"
+            Bucket=self.bucket_name, Key=self.path, Range=f"bytes={index}-{index + length - 1}"
         )
         return response["Body"].read()
diff --git a/pyproject.toml b/pyproject.toml
index b40cfb5af..ea8d85ab1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,8 +31,6 @@ dependencies = [
 dev = [
     "ruff",
     "mypy>=1.0,<1.4",
-    "black>=23.1,<24.0",
-    "isort>=5.12,<5.13",
     "pytest",
     "pytest-sphinx",
     "twine>=1.11.0",
@@ -90,29 +88,6 @@ exclude = [
     "inference.*",
 ]
 
-[tool.black]
-line-length = 115
-include = '\.pyi?$'
-exclude = '''
-(
-      __pycache__
-    | \.git
-    | \.mypy_cache
-    | \.pytest_cache
-    | \.vscode
-    | \.venv
-    | \bdist\b
-    | \bdoc\b
-    | pretrain_data/
-    | inference/
-)
-'''
-
-[tool.isort]
-profile = "black"
-multi_line_output = 3
-extend_skip = ["pretrain_data", "tokenizer"]
-
 [tool.ruff]
 line-length = 115
 lint.ignore = ["F403", "F405", "E501"]
@@ -145,6 +120,9 @@ exclude = [
 [tool.ruff.lint.per-file-ignores]
 "**/__init__.py" = ["F401"]
 
+[tool.ruff.format]
+# ruff format is black-compatible by default
+
 [tool.pyright]
 reportPrivateImportUsage = false
 exclude = ["pretrain_data/", "tokenizer/"]
diff --git a/scripts/add_code_eval.py b/scripts/add_code_eval.py
index 939b9da69..dd12a4f59 100644
--- a/scripts/add_code_eval.py
+++ b/scripts/add_code_eval.py
@@ -1,6 +1,7 @@
 """
 Script to create perplexity eval datasets for code.
 """
+
 import os
 
 import pandas as pd
diff --git a/scripts/compare_module_outputs.py b/scripts/compare_module_outputs.py
index 9c47497b8..5d31af3d5 100644
--- a/scripts/compare_module_outputs.py
+++ b/scripts/compare_module_outputs.py
@@ -4,7 +4,7 @@
 This script is useful for identifying where model activations start to differ
 within 2 forward passes that should yield identical results. In turn, detecting
-regressions can be a lot quicker/easier. 
+regressions can be a lot quicker/easier.
 
 This script requires that traces containing submodule outputs have been collected
 during training. The traces can be saved using
diff --git a/scripts/compare_wandb_configs.py b/scripts/compare_wandb_configs.py
index aed025e54..89180c673 100644
--- a/scripts/compare_wandb_configs.py
+++ b/scripts/compare_wandb_configs.py
@@ -6,7 +6,7 @@
 
 Comparing Peteish7 to Amberish7
 
-    python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2
-    
+    python scripts/compare_wandb_configs.py https://wandb.ai/ai2-llm/olmo-medium/runs/cej4ya39 https://wandb.ai/ai2-llm/olmo-medium/runs/ij4ls6v2
+
 """
diff --git a/scripts/flops_by_perf_figure.py b/scripts/flops_by_perf_figure.py
index e062cdd66..8a987fda5 100644
--- a/scripts/flops_by_perf_figure.py
+++ b/scripts/flops_by_perf_figure.py
@@ -23,7 +23,7 @@ Zamba-2-7B,,65.2,92.2,89.4,79.6,68.5,51.7,36.5,55.5,67.2,32.8,78.8
 
 Invocation looks like:
-    
+
     python scripts/flops_by_perf_figure.py /path/to/results.csv output/
 
 @kyleclo, @soldni
diff --git a/scripts/init_config.py b/scripts/init_config.py
index 22c223d7b..b405e6b92 100644
--- a/scripts/init_config.py
+++ b/scripts/init_config.py
@@ -1,6 +1,7 @@
 """
 Run this to initialize a new training config to a file.
""" + import logging import sys from pathlib import Path diff --git a/scripts/ladder.ipynb b/scripts/ladder.ipynb index aa1dc4fd4..589067850 100644 --- a/scripts/ladder.ipynb +++ b/scripts/ladder.ipynb @@ -38,7 +38,7 @@ " \"baseline-300M-3xC\": 2.457,\n", " \"baseline-300M-4xC\": 2.425,\n", " \"baseline-300M-10xC\": 2.356,\n", - " \"baseline-750M-2xC\": 2.290\n", + " \"baseline-750M-2xC\": 2.290,\n", "}\n", "\n", "c4_ce = {\n", @@ -52,17 +52,17 @@ " \"baseline-300M-3xC\": 3.058,\n", " \"baseline-300M-4xC\": 3.028,\n", " \"baseline-300M-10xC\": 2.952,\n", - " \"baseline-750M-2xC\": 2.885\n", + " \"baseline-750M-2xC\": 2.885,\n", "}\n", "\n", "run_name_re = re.compile(r\"^([^-]+)-([^-]+)-([^-]+)$\")\n", "\n", + "\n", "def parse_run_name(name: str):\n", " name, size, length = run_name_re.match(name).groups()\n", " size = ladder.parse_size(size)\n", " length = ladder.parse_length(length, size)\n", - " return name, size, length\n", - " " + " return name, size, length" ] }, { @@ -99,7 +99,7 @@ "for run in [\"baseline-150M-2xC\", \"baseline-300M-2xC\", \"baseline-750M-2xC\", \"baseline-1B-2xC\"]:\n", " name, size, length = parse_run_name(run)\n", " x.append(size)\n", - " y.append((pile_ce[run] + c4_ce[run])/2)\n", + " y.append((pile_ce[run] + c4_ce[run]) / 2)\n", "ax1.scatter(x, y)\n", "ax1.set_title(\"Varying model size, all Cx2\")\n", "ax1.set_xlabel(\"Model size\")\n", @@ -108,10 +108,16 @@ "# make length plot\n", "x = []\n", "y = []\n", - "for run in [\"baseline-300M-1xC\", \"baseline-300M-2xC\", \"baseline-300M-3xC\", \"baseline-300M-4xC\", \"baseline-300M-10xC\"]:\n", + "for run in [\n", + " \"baseline-300M-1xC\",\n", + " \"baseline-300M-2xC\",\n", + " \"baseline-300M-3xC\",\n", + " \"baseline-300M-4xC\",\n", + " \"baseline-300M-10xC\",\n", + "]:\n", " name, size, length = parse_run_name(run)\n", " x.append(length)\n", - " y.append((pile_ce[run] + c4_ce[run])/2)\n", + " y.append((pile_ce[run] + c4_ce[run]) / 2)\n", "ax2.scatter(x, y)\n", "ax2.set_title(\"Varying run length, all 300M\")\n", "ax2.set_xlabel(\"Run length in tokens\")\n", @@ -157,8 +163,11 @@ "import numpy as np\n", "from scipy.optimize import curve_fit\n", "\n", + "\n", "def power_law(x, a, b, c):\n", - " return a * x ** b + c\n", + " return a * x**b + c\n", + "\n", + "\n", "power_law_bounds = ([0, -np.inf, 0], [np.inf, 0, np.inf])\n", "\n", "fix, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))\n", @@ -170,7 +179,7 @@ " _, size, _ = run_name_re.match(run).groups()\n", " _, size = ladder.size_for_model(size)\n", " x.append(size)\n", - " y.append((pile_ce[run] + c4_ce[run])/2)\n", + " y.append((pile_ce[run] + c4_ce[run]) / 2)\n", "\n", "train_x = np.array(x[:-1])\n", "train_y = np.array(y[:-1])\n", @@ -186,10 +195,16 @@ "# run length\n", "x = []\n", "y = []\n", - "for run in [\"baseline-300M-1xC\", \"baseline-300M-2xC\", \"baseline-300M-3xC\", \"baseline-300M-4xC\", \"baseline-300M-10xC\"]:\n", + "for run in [\n", + " \"baseline-300M-1xC\",\n", + " \"baseline-300M-2xC\",\n", + " \"baseline-300M-3xC\",\n", + " \"baseline-300M-4xC\",\n", + " \"baseline-300M-10xC\",\n", + "]:\n", " _, _, length = parse_run_name(run)\n", " x.append(length)\n", - " y.append((pile_ce[run] + c4_ce[run])/2)\n", + " y.append((pile_ce[run] + c4_ce[run]) / 2)\n", "\n", "train_x = np.array(x[:-1])\n", "train_y = np.array(y[:-1])\n", diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index bae690697..70d846a8c 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -315,9 +315,9 @@ def 
make_source_and_target( random.shuffle(exploded_src) if paths_per_worker > 1: - assert ( - len(exploded_src) >= paths_per_worker - ), f"Number of paths ({len(exploded_src)}) must be >= paths_per_worker ({paths_per_worker})" + assert len(exploded_src) >= paths_per_worker, ( + f"Number of paths ({len(exploded_src)}) must be >= paths_per_worker ({paths_per_worker})" + ) # group the paths into chunks of paths_per_worker exploded_src = [ @@ -325,7 +325,7 @@ def make_source_and_target( ] # determine the destination paths - exploded_dst = [f'{output.rstrip("/")}/{i:0{output_digits}d}' for i in range(len(exploded_src))] + exploded_dst = [f"{output.rstrip('/')}/{i:0{output_digits}d}" for i in range(len(exploded_src))] return tuple(exploded_src), tuple(exploded_dst) diff --git a/scripts/show_model_size.py b/scripts/show_model_size.py index cf2ca1e22..66b50490e 100644 --- a/scripts/show_model_size.py +++ b/scripts/show_model_size.py @@ -7,6 +7,7 @@ python scripts/show_model_size.py train_config.yaml ``` """ + import logging import sys diff --git a/tests/grad_norm_test.py b/tests/grad_norm_test.py index 17aa27032..495ca4934 100644 --- a/tests/grad_norm_test.py +++ b/tests/grad_norm_test.py @@ -62,9 +62,9 @@ def _init_torch_optim(cfg, model): inter_params = decay & no_decay union_params = decay | no_decay assert len(inter_params) == 0, f"parameters {inter_params} made it into both decay/no_decay sets!" - assert ( - len(all_params.keys() - union_params) == 0 - ), f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!" + assert len(all_params.keys() - union_params) == 0, ( + f"parameters {all_params.keys() - union_params} were not separated into either decay/no_decay set!" + ) # Create the pytorch optimizer groups. optim_groups = [ @@ -215,9 +215,9 @@ def _naive_train_loop( # params set by observing grads for the two cases on a cpu run assert total_grad_diff < 1e-4, "model gradients diverged during optimization" assert total_param_diff < 1e-2, "model parameters diverged during optimization" - assert ( - torch.abs(torch_grad_norm - olmo_grad_norm) < 1e-6 - ), "grad norms computed by torch and OLMo codebase are different" + assert torch.abs(torch_grad_norm - olmo_grad_norm) < 1e-6, ( + "grad norms computed by torch and OLMo codebase are different" + ) if step_count == max_iterations: break