From e006938d7d292fd68ee01bb530846434783698f4 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:31:41 -0700 Subject: [PATCH 1/3] add visualization dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 50c4b4f..e2e044b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,10 +59,12 @@ nwb_benchmarks = "nwb_benchmarks.command_line_interface:main" app = ["flask", "flask-cors", "flask_restx"] database = ["polars"] dev = ["ipython", "pre-commit"] +figures = ["seaborn", "matplotlib", "pyarrow"] all = [ {include-group = "app"}, {include-group = "database"}, - {include-group = "dev"} + {include-group = "dev"}, + {include-group = "figures"} ] From 9a9cd7826e0b9552dfba4f2e9be4793ed1e0ad54 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:32:18 -0700 Subject: [PATCH 2/3] refactor dataclasses, parquet processing --- src/nwb_benchmarks/_flask/_app.py | 250 +-------------------- src/nwb_benchmarks/database/__init__.py | 22 ++ src/nwb_benchmarks/database/_models.py | 211 +++++++++++++++++ src/nwb_benchmarks/database/_processing.py | 82 +++++++ 4 files changed, 320 insertions(+), 245 deletions(-) create mode 100644 src/nwb_benchmarks/database/__init__.py create mode 100644 src/nwb_benchmarks/database/_models.py create mode 100644 src/nwb_benchmarks/database/_processing.py diff --git a/src/nwb_benchmarks/_flask/_app.py b/src/nwb_benchmarks/_flask/_app.py index 1556751..d059c2c 100644 --- a/src/nwb_benchmarks/_flask/_app.py +++ b/src/nwb_benchmarks/_flask/_app.py @@ -1,4 +1,3 @@ -import dataclasses import json import os import pathlib @@ -6,14 +5,12 @@ import time import traceback import typing -import uuid -from datetime import datetime -from pathlib import Path - import flask import flask_restx -import packaging.version -import typing_extensions + +from pathlib import Path + +from nwb_benchmarks.database._processing import repackage_as_parquet app = flask.Flask(__name__) api = flask_restx.Api( @@ -99,7 +96,7 @@ def post(self) -> int: time.sleep(1) - repackage_as_parquet(directory=directory, output_directory=output_directory) + repackage_as_parquet(directory=directory, output_directory=output_directory, minimum_version="3.0.0") time.sleep(1) @@ -208,243 +205,6 @@ def push(self): raise RuntimeError(message) -@dataclasses.dataclass -class Machine: - id: str - name: str - version: str - os: dict - sys: dict - platform: dict - psutil: dict - cuda: dict - asv: dict - - @classmethod - def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: - with file_path.open(mode="r") as file_stream: - data = json.load(file_stream) - - version = str(data.get("version", None)) - if version is None or packaging.version.Version(version) < packaging.version.Version(version="1.1.0"): - return None - - machine_id = file_path.stem.removeprefix("machine-") - - return cls( - id=machine_id, - name=data.get("name", ""), - version=version, - os=data.get("os", {}), - sys=data.get("sys", {}), - platform=data.get("platform", {}), - psutil=data.get("psutil", {}), - cuda=data.get("cuda", {}), - asv=data.get("asv", {}), - ) - - def to_dataframe(self) -> "polars.DataFrame": - import polars - - data = { - "name": self.name, - "version": self.version, - "os": json.dumps(self.os), - "sys": json.dumps(self.sys), - "platform": json.dumps(self.platform), - "psutil": json.dumps(self.psutil), - "cuda": json.dumps(self.cuda), - "asv": json.dumps(self.asv), - } - - data_frame = polars.DataFrame(data=data) - return data_frame - - -@dataclasses.dataclass -class Environment: - environment_id: str - preamble: str - - # Allow arbitrary fields - def __init__(self, environment_id: str, preamble: str, **kwargs) -> None: - self.environment_id = environment_id - self.preamble = preamble - for key, value in kwargs.items(): - setattr(self, key, value) - - @classmethod - def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: - with file_path.open(mode="r") as file_stream: - data = json.load(fp=file_stream) - - if len(data) > 1: - return None - - environment_id = file_path.stem.removeprefix("environment-") - preamble = next(iter(data.keys())) - - packages = { - package["name"]: f'{package["version"]} ({package["build"]})' - for package in data[preamble] - if len(package) == 3 - } - - if not any(packages): - return None - - return cls(environment_id=environment_id, preamble=preamble, **packages) - - def to_dataframe(self) -> "polars.DataFrame": - import polars - - data = { - "environment_id": self.environment_id, - "preamble": self.preamble, - } - for package_name, package_details in self.__dict__.items(): - if package_name not in ["environment_id", "preamble"]: - data[package_name] = package_details - - data_frame = polars.DataFrame(data=data, orient="col") - return data_frame - - -@dataclasses.dataclass -class Result: - uuid: str - version: str - timestamp: str - commit_hash: str - environment_id: str - machine_id: str - benchmark_name: str - parameter_case: str - value: float - variable: str - - -@dataclasses.dataclass -class Results: - results: list[Result] - - @classmethod - def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: - with file_path.open(mode="r") as file_stream: - data = json.load(fp=file_stream) - - database_version = data.get("database_version", None) - if database_version is None or packaging.version.Version(data["database_version"]) < packaging.version.Version( - version="1.0.0" - ): - return None - - timestamp = data["timestamp"] - commit_hash = data["commit_hash"] - environment_id = data["environment_id"] - machine_id = data["machine_id"] - - def normalize_time_and_network_results(benchmark_results) -> dict: - """Convert benchmark results to a consistent dict format with list values.""" - if isinstance(benchmark_results, dict): - value_dict = benchmark_results - else: - value_dict = dict(time=benchmark_results) - - # Ensure all values are lists - return {k: v if isinstance(v, list) else [float(v)] for k, v in value_dict.items()} - - results = [ - Result( - uuid=str(uuid.uuid4()), # TODO: add this to each results file so it is persistent - version=database_version, - timestamp=timestamp, - commit_hash=commit_hash, - environment_id=environment_id, - machine_id=machine_id, - benchmark_name=benchmark_name, - parameter_case=parameter_case, - value=value, - variable=variable_name, - ) - for benchmark_name, parameter_cases in data["results"].items() - for parameter_case, benchmark_results in parameter_cases.items() - for variable_name, values in normalize_time_and_network_results(benchmark_results).items() - for value in values - ] - - return cls(results=results) - - def to_dataframe(self) -> "polars.DataFrame": - import polars - - data = { - "uuid": [result.uuid for result in self.results], - "version": [result.version for result in self.results], - "commit_hash": [result.commit_hash for result in self.results], - "environment_id": [result.environment_id for result in self.results], - "machine_id": [result.machine_id for result in self.results], - "benchmark_name": [result.benchmark_name for result in self.results], - "parameter_case": [result.parameter_case for result in self.results], - "value": [result.value for result in self.results], - "variable": [result.variable for result in self.results], - } - - data_frame = polars.DataFrame(data=data) - return data_frame - - -def repackage_as_parquet(directory: pathlib.Path, output_directory: pathlib.Path) -> None: - import polars - - # Machines - machines_data_frames = [] - machines_directory = directory / "machines" - for machine_file_path in machines_directory.iterdir(): - machine = Machine.safe_load_from_json(file_path=machine_file_path) - - if machine is None: - continue - - machine_data_frame = machine.to_dataframe() - machines_data_frames.append(machine_data_frame) - machines_database = polars.concat(items=machines_data_frames, how="diagonal_relaxed") - - machines_database_file_path = output_directory / "machines.parquet" - machines_database.write_parquet(file=machines_database_file_path) - - # Environments - environments_data_frames = [] - environments_directory = directory / "environments" - for environment_file_path in environments_directory.iterdir(): - environment = Environment.safe_load_from_json(file_path=environment_file_path) - - if environment is None: - continue - - environment_data_frame = environment.to_dataframe() - environments_data_frames.append(environment_data_frame) - environments_database = polars.concat(items=environments_data_frames, how="diagonal") - - environments_database_file_path = output_directory / "environments.parquet" - environments_database.write_parquet(file=environments_database_file_path) - - # Results - all_results_data_frames = [] - results_directory = directory / "results" - for result_file_path in results_directory.iterdir(): - results = Results.safe_load_from_json(file_path=result_file_path) - - if results is None: - continue - results_data_frame = results.to_dataframe() - all_results_data_frames.append(results_data_frame) - all_results_database = polars.concat(items=all_results_data_frames, how="diagonal") - - all_results_database_file_path = output_directory / "results.parquet" - all_results_database.write_parquet(file=all_results_database_file_path) - - if __name__ == "__main__": DEBUG_MODE = os.environ.get("NWB_BENCHMARKS_DEBUG", None) if DEBUG_MODE is not None and DEBUG_MODE != "1": diff --git a/src/nwb_benchmarks/database/__init__.py b/src/nwb_benchmarks/database/__init__.py new file mode 100644 index 0000000..3c8e2dd --- /dev/null +++ b/src/nwb_benchmarks/database/__init__.py @@ -0,0 +1,22 @@ +"""Exposed imports to the `database` submodule.""" + +from ._models import ( + Machine, + Result, + Results, + Environment +) + +from ._processing import ( + concat_dataclasses_to_parquet, + repackage_as_parquet, +) + +__all__ = [ + "Machine", + "Result", + "Results", + "Environment", + "concat_dataclasses_to_parquet", + "repackage_as_parquet", +] diff --git a/src/nwb_benchmarks/database/_models.py b/src/nwb_benchmarks/database/_models.py new file mode 100644 index 0000000..992ade7 --- /dev/null +++ b/src/nwb_benchmarks/database/_models.py @@ -0,0 +1,211 @@ +import ast +import dataclasses +import json +import pathlib +import re +import uuid + +import packaging.version +import typing_extensions + + +@dataclasses.dataclass +class Result: + uuid: str + version: str + timestamp: str + commit_hash: str + environment_id: str + machine_id: str + benchmark_name: str + parameter_case: dict + value: float + variable: str + + +@dataclasses.dataclass +class Results: + results: list[Result] + + @classmethod + def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: + with file_path.open(mode="r") as file_stream: + data = json.load(fp=file_stream) + + database_version = data.get("database_version", None) + if database_version is None or packaging.version.Version(data["database_version"]) < packaging.version.Version( + version="1.0.0" + ): + return None + + timestamp = data["timestamp"] + commit_hash = data["commit_hash"] + environment_id = data["environment_id"] + machine_id = data["machine_id"] + + def normalize_time_and_network_results(benchmark_results) -> dict: + """Convert benchmark results to a consistent dict format with list values.""" + if isinstance(benchmark_results, dict): + value_dict = benchmark_results + else: + value_dict = dict(time=benchmark_results) + + # Ensure all values are lists + return {k: v if isinstance(v, list) else [float(v)] for k, v in value_dict.items()} + + def parse_parameter_case(s): + # replace any slice(...) with "slice(...)" for safe parsing + modified_s = re.sub(r'slice\([^)]+\)', r'"\g<0>"', s) + output = ast.literal_eval(modified_s) + + # if the parsed string is not a dict (older benchmarks results), convert it to one + if not isinstance(output, dict): + output = {'https_url': output[0].strip("'")} + + return output + + results = [ + Result( + uuid=str(uuid.uuid4()), # TODO: add this to each results file so it is persistent + version=database_version, + timestamp=timestamp, + commit_hash=commit_hash, + environment_id=environment_id, + machine_id=machine_id, + benchmark_name=benchmark_name, + parameter_case=parse_parameter_case(parameter_case), + value=value, + variable=variable_name, + ) + for benchmark_name, parameter_cases in data["results"].items() + for parameter_case, benchmark_results in parameter_cases.items() + for variable_name, values in normalize_time_and_network_results(benchmark_results).items() + for value in values + ] + + + return cls(results=results) + + def to_dataframe(self) -> "polars.DataFrame": + import polars + + data = { + "uuid": [result.uuid for result in self.results], + "version": [result.version for result in self.results], + "commit_hash": [result.commit_hash for result in self.results], + "environment_id": [result.environment_id for result in self.results], + "machine_id": [result.machine_id for result in self.results], + "benchmark_name": [result.benchmark_name for result in self.results], + "parameter_case_name": [result.parameter_case.get('name') for result in self.results], + "parameter_case_https_url": [result.parameter_case.get('https_url') for result in self.results], + "parameter_case_object_name": [result.parameter_case.get('object_name') for result in self.results], + "parameter_case_slice_range": [result.parameter_case.get('slice_range')for result in self.results], + "value": [result.value for result in self.results], + "variable": [result.variable for result in self.results], + } + + data_frame = polars.DataFrame(data=data) + return data_frame + + +@dataclasses.dataclass +class Machine: + id: str + name: str + version: str + os: dict + sys: dict + platform: dict + psutil: dict + cuda: dict + asv: dict + + @classmethod + def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: + with file_path.open(mode="r") as file_stream: + data = json.load(file_stream) + + version = str(data.get("version", None)) + if version is None or packaging.version.Version(version) < packaging.version.Version(version="1.1.0"): + return None + + machine_id = file_path.stem.removeprefix("machine-") + + return cls( + id=machine_id, + name=data.get("name", ""), + version=version, + os=data.get("os", {}), + sys=data.get("sys", {}), + platform=data.get("platform", {}), + psutil=data.get("psutil", {}), + cuda=data.get("cuda", {}), + asv=data.get("asv", {}), + ) + + def to_dataframe(self) -> "polars.DataFrame": + import polars + + data = { + "machine_id": self.id, + "name": self.name, + "version": self.version, + "os": json.dumps(self.os), + "sys": json.dumps(self.sys), + "platform": json.dumps(self.platform), + "psutil": json.dumps(self.psutil), + "cuda": json.dumps(self.cuda), + "asv": json.dumps(self.asv), + } + + data_frame = polars.DataFrame(data=data) + return data_frame + + +@dataclasses.dataclass +class Environment: + environment_id: str + preamble: str + + # Allow arbitrary fields + def __init__(self, environment_id: str, preamble: str, **kwargs) -> None: + self.environment_id = environment_id + self.preamble = preamble + for key, value in kwargs.items(): + setattr(self, key, value) + + @classmethod + def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self | None: + with file_path.open(mode="r") as file_stream: + data = json.load(fp=file_stream) + + if len(data) > 1: + return None + + environment_id = file_path.stem.removeprefix("environment-") + preamble = next(iter(data.keys())) + + packages = { + package["name"]: f'{package["version"]} ({package["build"]})' + for package in data[preamble] + if len(package) == 3 + } + + if not any(packages): + return None + + return cls(environment_id=environment_id, preamble=preamble, **packages) + + def to_dataframe(self) -> "polars.DataFrame": + import polars + + data = { + "environment_id": self.environment_id, + "preamble": self.preamble, + } + for package_name, package_details in self.__dict__.items(): + if package_name not in ["environment_id", "preamble"]: + data[package_name] = package_details + + data_frame = polars.DataFrame(data=data, orient="col") + return data_frame \ No newline at end of file diff --git a/src/nwb_benchmarks/database/_processing.py b/src/nwb_benchmarks/database/_processing.py new file mode 100644 index 0000000..e3b28c0 --- /dev/null +++ b/src/nwb_benchmarks/database/_processing.py @@ -0,0 +1,82 @@ + +import dataclasses +import pathlib +import packaging +import polars + +from ._models import Machine, Environment, Results + + +def concat_dataclasses_to_parquet(directory: pathlib.Path, + output_directory: pathlib.Path, + dataclass_name: str, + dataclass: dataclasses.dataclass, + concat_how: str = "diagonal_relaxed", + minimum_version: str = "1.0.0") -> None: + """Generic function to process any data type (machines, environments, results) + + Args: + directory (pathlib.Path): Path to the root directory containing data subdirectories. + output_directory (pathlib.Path): Path to the output directory where the parquet file will be saved. + dataclass_name (str): Name of the data class, used for input and output filenames. + dataclass: The dataclass type to process (Machine, Environment, Results). + concat_how (str, optional): How to concatenate dataframes. Defaults to "diagonal_relaxed". + minimum_version (str, optional): Minimum version of the database to include. Defaults to "1.0.0". + Returns: + + """ + + data_frames = [] + data_directory = directory / dataclass_name + + for file_path in data_directory.iterdir(): + obj = dataclass.safe_load_from_json(file_path=file_path) + + if obj is None: + continue + + data_frame = obj.to_dataframe() + + # filter by minimum version (before concatenation to avoid issues with different results structures) + # TODO - should environment have a version? + if "version" in data_frame.columns: + data_frame = data_frame.filter( + polars.col("version").map_elements( + lambda x: packaging.version.parse(x) >= packaging.version.parse(minimum_version), + return_dtype=polars.Boolean + )) + + data_frames.append(data_frame) + + if data_frames: + database = polars.concat(items=data_frames, how=concat_how) + output_file_path = output_directory / f"{dataclass_name}.parquet" + database.write_parquet(file=output_file_path) + + +def repackage_as_parquet(directory: pathlib.Path, output_directory: pathlib.Path, minimum_version: str = "1.0.0") -> None: + """Repackage JSON results files as parquet databases for easier querying.""" + + # Machines + concat_dataclasses_to_parquet(directory=directory, + output_directory=output_directory, + dataclass_name="machines", + dataclass=Machine, + concat_how="diagonal_relaxed", + minimum_version=minimum_version) + + # Environments + concat_dataclasses_to_parquet(directory=directory, + output_directory=output_directory, + dataclass_name="environments", + dataclass=Environment, + concat_how="diagonal", + minimum_version=minimum_version) + + # Results + concat_dataclasses_to_parquet(directory=directory, + output_directory=output_directory, + dataclass_name="results", + dataclass=Results, + concat_how="diagonal_relaxed", + minimum_version=minimum_version) From 994b1b42537fe12f13d0cc30edac20915fc08f11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:49:07 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/nwb_benchmarks/_flask/_app.py | 4 +- src/nwb_benchmarks/database/__init__.py | 8 +-- src/nwb_benchmarks/database/_models.py | 23 +++--- src/nwb_benchmarks/database/_processing.py | 83 ++++++++++++---------- 4 files changed, 61 insertions(+), 57 deletions(-) diff --git a/src/nwb_benchmarks/_flask/_app.py b/src/nwb_benchmarks/_flask/_app.py index d059c2c..a327f77 100644 --- a/src/nwb_benchmarks/_flask/_app.py +++ b/src/nwb_benchmarks/_flask/_app.py @@ -5,11 +5,11 @@ import time import traceback import typing +from pathlib import Path + import flask import flask_restx -from pathlib import Path - from nwb_benchmarks.database._processing import repackage_as_parquet app = flask.Flask(__name__) diff --git a/src/nwb_benchmarks/database/__init__.py b/src/nwb_benchmarks/database/__init__.py index 3c8e2dd..9f3a046 100644 --- a/src/nwb_benchmarks/database/__init__.py +++ b/src/nwb_benchmarks/database/__init__.py @@ -1,12 +1,6 @@ """Exposed imports to the `database` submodule.""" -from ._models import ( - Machine, - Result, - Results, - Environment -) - +from ._models import Environment, Machine, Result, Results from ._processing import ( concat_dataclasses_to_parquet, repackage_as_parquet, diff --git a/src/nwb_benchmarks/database/_models.py b/src/nwb_benchmarks/database/_models.py index 992ade7..c70523d 100644 --- a/src/nwb_benchmarks/database/_models.py +++ b/src/nwb_benchmarks/database/_models.py @@ -42,26 +42,26 @@ def safe_load_from_json(cls, file_path: pathlib.Path) -> typing_extensions.Self commit_hash = data["commit_hash"] environment_id = data["environment_id"] machine_id = data["machine_id"] - + def normalize_time_and_network_results(benchmark_results) -> dict: """Convert benchmark results to a consistent dict format with list values.""" if isinstance(benchmark_results, dict): value_dict = benchmark_results else: value_dict = dict(time=benchmark_results) - + # Ensure all values are lists return {k: v if isinstance(v, list) else [float(v)] for k, v in value_dict.items()} def parse_parameter_case(s): # replace any slice(...) with "slice(...)" for safe parsing - modified_s = re.sub(r'slice\([^)]+\)', r'"\g<0>"', s) + modified_s = re.sub(r"slice\([^)]+\)", r'"\g<0>"', s) output = ast.literal_eval(modified_s) # if the parsed string is not a dict (older benchmarks results), convert it to one if not isinstance(output, dict): - output = {'https_url': output[0].strip("'")} - + output = {"https_url": output[0].strip("'")} + return output results = [ @@ -83,7 +83,6 @@ def parse_parameter_case(s): for value in values ] - return cls(results=results) def to_dataframe(self) -> "polars.DataFrame": @@ -96,17 +95,17 @@ def to_dataframe(self) -> "polars.DataFrame": "environment_id": [result.environment_id for result in self.results], "machine_id": [result.machine_id for result in self.results], "benchmark_name": [result.benchmark_name for result in self.results], - "parameter_case_name": [result.parameter_case.get('name') for result in self.results], - "parameter_case_https_url": [result.parameter_case.get('https_url') for result in self.results], - "parameter_case_object_name": [result.parameter_case.get('object_name') for result in self.results], - "parameter_case_slice_range": [result.parameter_case.get('slice_range')for result in self.results], + "parameter_case_name": [result.parameter_case.get("name") for result in self.results], + "parameter_case_https_url": [result.parameter_case.get("https_url") for result in self.results], + "parameter_case_object_name": [result.parameter_case.get("object_name") for result in self.results], + "parameter_case_slice_range": [result.parameter_case.get("slice_range") for result in self.results], "value": [result.value for result in self.results], "variable": [result.variable for result in self.results], } data_frame = polars.DataFrame(data=data) return data_frame - + @dataclasses.dataclass class Machine: @@ -208,4 +207,4 @@ def to_dataframe(self) -> "polars.DataFrame": data[package_name] = package_details data_frame = polars.DataFrame(data=data, orient="col") - return data_frame \ No newline at end of file + return data_frame diff --git a/src/nwb_benchmarks/database/_processing.py b/src/nwb_benchmarks/database/_processing.py index e3b28c0..cea3376 100644 --- a/src/nwb_benchmarks/database/_processing.py +++ b/src/nwb_benchmarks/database/_processing.py @@ -1,20 +1,22 @@ - import dataclasses import pathlib + import packaging import polars -from ._models import Machine, Environment, Results +from ._models import Environment, Machine, Results -def concat_dataclasses_to_parquet(directory: pathlib.Path, - output_directory: pathlib.Path, - dataclass_name: str, - dataclass: dataclasses.dataclass, - concat_how: str = "diagonal_relaxed", - minimum_version: str = "1.0.0") -> None: +def concat_dataclasses_to_parquet( + directory: pathlib.Path, + output_directory: pathlib.Path, + dataclass_name: str, + dataclass: dataclasses.dataclass, + concat_how: str = "diagonal_relaxed", + minimum_version: str = "1.0.0", +) -> None: """Generic function to process any data type (machines, environments, results) - + Args: directory (pathlib.Path): Path to the root directory containing data subdirectories. output_directory (pathlib.Path): Path to the output directory where the parquet file will be saved. @@ -25,16 +27,16 @@ def concat_dataclasses_to_parquet(directory: pathlib.Path, Returns: """ - + data_frames = [] data_directory = directory / dataclass_name - + for file_path in data_directory.iterdir(): obj = dataclass.safe_load_from_json(file_path=file_path) - + if obj is None: continue - + data_frame = obj.to_dataframe() # filter by minimum version (before concatenation to avoid issues with different results structures) @@ -43,40 +45,49 @@ def concat_dataclasses_to_parquet(directory: pathlib.Path, data_frame = data_frame.filter( polars.col("version").map_elements( lambda x: packaging.version.parse(x) >= packaging.version.parse(minimum_version), - return_dtype=polars.Boolean - )) + return_dtype=polars.Boolean, + ) + ) data_frames.append(data_frame) - + if data_frames: database = polars.concat(items=data_frames, how=concat_how) output_file_path = output_directory / f"{dataclass_name}.parquet" database.write_parquet(file=output_file_path) -def repackage_as_parquet(directory: pathlib.Path, output_directory: pathlib.Path, minimum_version: str = "1.0.0") -> None: +def repackage_as_parquet( + directory: pathlib.Path, output_directory: pathlib.Path, minimum_version: str = "1.0.0" +) -> None: """Repackage JSON results files as parquet databases for easier querying.""" - + # Machines - concat_dataclasses_to_parquet(directory=directory, - output_directory=output_directory, - dataclass_name="machines", - dataclass=Machine, - concat_how="diagonal_relaxed", - minimum_version=minimum_version) + concat_dataclasses_to_parquet( + directory=directory, + output_directory=output_directory, + dataclass_name="machines", + dataclass=Machine, + concat_how="diagonal_relaxed", + minimum_version=minimum_version, + ) # Environments - concat_dataclasses_to_parquet(directory=directory, - output_directory=output_directory, - dataclass_name="environments", - dataclass=Environment, - concat_how="diagonal", - minimum_version=minimum_version) + concat_dataclasses_to_parquet( + directory=directory, + output_directory=output_directory, + dataclass_name="environments", + dataclass=Environment, + concat_how="diagonal", + minimum_version=minimum_version, + ) # Results - concat_dataclasses_to_parquet(directory=directory, - output_directory=output_directory, - dataclass_name="results", - dataclass=Results, - concat_how="diagonal_relaxed", - minimum_version=minimum_version) + concat_dataclasses_to_parquet( + directory=directory, + output_directory=output_directory, + dataclass_name="results", + dataclass=Results, + concat_how="diagonal_relaxed", + minimum_version=minimum_version, + )