Skip to content

Commit 2aeb5cc

Browse files
committed
feat: cache index of configs automatically instead of generating
1 parent b4cf7ab commit 2aeb5cc

6 files changed

Lines changed: 95 additions & 28 deletions

File tree

carps/analysis/gather_data.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from carps.analysis.calc_hypervolume import add_hypervolume_to_df
2323
from carps.analysis.utils import convert_mixed_types_to_str, get_ids_mo
24+
from carps.utils.index_configs import get_index_config
2425
from carps.utils.loggingutils import get_logger, setup_logging
2526
from carps.utils.task import Task
2627
from carps.utils.trials import TrialInfo
@@ -404,9 +405,7 @@ def maybe_postadd_task(logs: pd.DataFrame, overwrite: bool = False) -> pd.DataFr
404405
logger.debug("No task_id in logs. Can't add task info.")
405406
return logs
406407
index_fn = Path(__file__).parent.parent / "configs/task/index.csv"
407-
if not index_fn.is_file():
408-
raise ValueError("ObjectiveFunction ids have not been indexed. Run `python -m carps.utils.index_configs`.")
409-
task_index = pd.read_csv(index_fn)
408+
task_index = get_index_config(index_fn)
410409

411410
new_logs = []
412411
for gid, gdf in logs.groupby(by=["task_id", "seed"]):

carps/analysis/process_data.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pandas as pd
1010
from omegaconf import DictConfig, ListConfig, OmegaConf
1111

12+
from carps.utils.index_configs import get_index_config
1213
from carps.utils.loggingutils import get_logger, setup_logging
1314

1415
setup_logging()
@@ -69,9 +70,7 @@ def maybe_postadd_task(logs: pd.DataFrame) -> pd.DataFrame:
6970
Logs with task columns.
7071
"""
7172
index_fn = Path(__file__).parent.parent / "configs/task/index.csv"
72-
if not index_fn.is_file():
73-
raise ValueError("Task ids have not been indexed. Run `python -m carps.utils.index_configs`.")
74-
task_index = pd.read_csv(index_fn)
73+
task_index = get_index_config(index_fn)
7574

7675
def load_task_cfg(task_id: str) -> DictConfig:
7776
config_fn = task_index["config_fn"][task_index["task_id"] == task_id].iloc[0]

carps/utils/index_configs.py

Lines changed: 87 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,30 @@
22

33
from __future__ import annotations
44

5+
import hashlib
56
from pathlib import Path
67

78
import fire
89
import pandas as pd
10+
from deprecated import deprecated
911
from omegaconf import OmegaConf
12+
from platformdirs import user_cache_dir
1013
from rich.progress import track
1114

1215
from carps.utils.loggingutils import get_logger
1316

1417
logger = get_logger("ConfigIndexer")
1518

19+
1620
config_folder = Path(__file__).parent.parent / "configs"
1721
config_folder_task = config_folder / "task"
1822
config_folder_optimizer = config_folder / "optimizer"
1923

24+
PATH_KEY_ZIP = {
25+
config_folder_task: "task_id",
26+
config_folder_optimizer: "optimizer_id",
27+
}
28+
2029

2130
def index_configs(extra_task_paths: list[str] | None = None, extra_optimizer_paths: list[str] | None = None) -> None:
2231
"""Index all task and optimizer configs.
@@ -31,22 +40,13 @@ def index_configs(extra_task_paths: list[str] | None = None, extra_optimizer_pat
3140
extra_optimizer_paths : list[str], optional
3241
Extra paths to custom optimizers, must be a folder containing only optimizer configs.
3342
"""
34-
config_folder_tasks = [config_folder_task] if extra_task_paths is None else [config_folder_task, *extra_task_paths] # type: ignore[list-item]
35-
config_folder_tasks = [Path(p) for p in config_folder_tasks]
36-
config_folder_optimizers = (
37-
[config_folder_optimizer]
38-
if extra_optimizer_paths is None
39-
else [config_folder_optimizer, *extra_optimizer_paths] # type: ignore[list-item]
40-
)
41-
config_folder_optimizers = [Path(p) for p in config_folder_optimizers]
42-
for key, paths in zip(["task_id", "optimizer_id"], [config_folder_tasks, config_folder_optimizers], strict=False):
43-
logger.info(f"Search configs for {key} from {paths}...")
44-
filenames = []
45-
for path in paths:
46-
filenames.extend(list(path.glob("**/*.yaml")))
43+
register_extra_paths(extra_task_paths, extra_optimizer_paths)
44+
45+
for path, key in PATH_KEY_ZIP.items():
46+
paths = list(path.glob("**/*.yaml"))
4747

4848
table_list = []
49-
for fn in track(filenames, total=len(filenames), description=f"Gathering for {key}..."):
49+
for fn in track(paths, total=len(paths), description=f"Gathering for {key}..."):
5050
cfg = OmegaConf.load(fn)
5151
value = cfg.get(key)
5252
table_list.append(
@@ -59,5 +59,78 @@ def index_configs(extra_task_paths: list[str] | None = None, extra_optimizer_pat
5959
table.to_csv(paths[0] / "index.csv", index=False)
6060

6161

62+
def create_table(key: str, paths: list[Path], target: Path) -> None:
    """Create an index table and write it to ``target`` as CSV.

    Every config file in ``paths`` is loaded with OmegaConf and contributes one
    row holding its filename (``config_fn``) and the value stored under ``key``
    (``None`` if the config has no such entry).

    Parameters
    ----------
    key : str
        Config entry to index, e.g. ``"task_id"`` or ``"optimizer_id"``.
    paths : list[Path]
        Config files to load.
    target : Path
        Destination of the CSV file.
    """
    rows = [{"config_fn": str(p), key: OmegaConf.load(p).get(key)} for p in paths]
    # Pin the columns: without them an empty `paths` produces a CSV that has no
    # `config_fn` / `key` header, so column lookups on the re-read index fail.
    table = pd.DataFrame(rows, columns=["config_fn", key])
    table.to_csv(target, index=False)
76+
77+
78+
def hash_inputs(paths: list[Path]) -> str:
    """Hash the names and contents of ``paths`` so the index file can be cached.

    The digest changes when a file is edited, added, removed, renamed or moved.
    Renames matter because the index stores the config filenames: hashing the
    contents alone would keep serving a stale index after a file was renamed.
    The result does not depend on the order of ``paths``.

    Parameters
    ----------
    paths : list[Path]
        Files to hash.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest.
    """
    hasher = hashlib.sha256()
    for path in sorted(paths):
        file_hasher = hashlib.sha256()
        with open(path, "rb") as f:
            # Read in 16 MiB chunks so large files are never held in memory at once.
            while chunk := f.read(16 * 1024 * 1024):
                file_hasher.update(chunk)
        # Feed "<path>\0<fixed-size content digest>" per file: this keeps file
        # boundaries unambiguous and ties the contents to the file they came from.
        hasher.update(str(path).encode("utf-8", "surrogateescape"))
        hasher.update(b"\0")
        hasher.update(file_hasher.digest())
    return hasher.hexdigest()
86+
87+
88+
def register_extra_paths(extra_task_paths: list[str] | None, extra_optimizer_paths: list[str] | None) -> None:
89+
"""Register extra task and optimizer paths.
90+
91+
Parameters
92+
----------
93+
extra_task_paths : list[str]
94+
Extra paths to custom tasks, must be a folder containing only task configs.
95+
extra_optimizer_paths : list[str]
96+
Extra paths to custom optimizers, must be a folder containing only optimizer configs.
97+
"""
98+
if not extra_task_paths:
99+
extra_task_paths = []
100+
if not extra_optimizer_paths:
101+
extra_optimizer_paths = []
102+
103+
for optimizer_path_str in extra_optimizer_paths:
104+
PATH_KEY_ZIP[Path(optimizer_path_str)] = "optimizer_id"
105+
for task_path_str in extra_task_paths:
106+
PATH_KEY_ZIP[Path(task_path_str)] = "task_id"
107+
108+
109+
def get_index_config(path: Path) -> pd.DataFrame:
    """Return the index of all configs located next to ``path``.

    The index holds the config filename ``config_fn`` and the ``task_id`` or
    ``optimizer_id`` of every ``*.yaml`` file found below ``path.parent``.
    Replaces the old indexing API: the table is stored in the user cache
    directory under a name derived from the config folder and a hash of the
    config files, and is only built when no file with that name exists yet.

    Parameters
    ----------
    path : Path
        Path where the old index file would have been. Only its parent
        directory is used.

    Returns
    -------
    pd.DataFrame
        The index table.

    Raises
    ------
    KeyError
        If the index has to be built and ``path.parent`` is not a key of
        ``PATH_KEY_ZIP``.
    """
    config_dir = path.parent
    # Identify the folder by a short digest of its full path instead of the
    # path with "/" replaced: drive colons and backslashes (Windows) are not
    # valid in file names, and deep paths could exceed file name length limits.
    dir_hash = hashlib.sha256(str(config_dir).encode("utf-8", "surrogateescape")).hexdigest()[:12]

    paths = list(config_dir.glob("**/*.yaml"))
    paths_hash = hash_inputs(paths)[:12]

    # The folder name is kept in the file name for readability only.
    cache_path = Path(user_cache_dir("carps")) / f"index-{config_dir.name}-{dir_hash}-{paths_hash}.csv"
    if not cache_path.is_file():
        cache_path.parent.mkdir(exist_ok=True, parents=True)
        create_table(PATH_KEY_ZIP[config_dir], paths, cache_path)

    return pd.read_csv(cache_path)
133+
134+
62135
if __name__ == "__main__":
63136
fire.Fire(index_configs)

carps/utils/overridefinder.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from pathlib import Path
77

88
import fire
9-
import pandas as pd
109

11-
from carps.utils.index_configs import index_configs
10+
from carps.utils.index_configs import get_index_config
1211
from carps.utils.loggingutils import get_logger
1312

1413
logger = get_logger(__file__)
@@ -45,9 +44,7 @@ def find_override(task_id: str | None = None, optimizer_id: str | None = None) -
4544
raise ValueError("Please specify either `task_id` or `optimizer_id`.")
4645

4746
index_fn = path / "index.csv"
48-
if not index_fn.is_file():
49-
index_configs()
50-
table = pd.read_csv(index_fn)
47+
table = get_index_config(index_fn)
5148

5249
try:
5350
config_fn = table["config_fn"][table[key] == to_find].to_numpy()[0]

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ dev = [
6666
"pre-commit",
6767
"ruff",
6868
"mypy",
69+
"types-Deprecated",
6970
"mkdocs",
7071
"mkdocs-material",
7172
"mkdocs-autorefs",

subselection/create_subset_configs.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ def write_subsets(subset_fn: str, identifier: str):
2222
task_ids = subset["task_id"].to_list()
2323

2424
index_fn = config_target_path.parent.parent / "index.csv"
25-
if not index_fn.is_file():
26-
raise ValueError(f"Could not find {index_fn}. ObjectiveFunction ids have not been indexed. Run `python -m carps.utils.index_configs`.")
27-
task_index = pd.read_csv(index_fn)
25+
task_index = get_index_config(index_fn)
2826
print(task_index.head())
2927
print(task_ids)
3028
not_found = [pid for pid in task_ids if pid not in task_index["task_id"].to_list()]

0 commit comments

Comments
 (0)