Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions sarc/cli/acquire/slurmconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from sarc.cache import CachePolicy, FormatterProto, with_cache
from sarc.client.gpumetrics import _gpu_billing_collection
from sarc.config import ClusterConfig, config
from sarc.config import ClusterConfig, config, UTC, TZLOCAL
from sarc.jobs.node_gpu_mapping import _node_gpu_mapping_collection

logger = logging.getLogger(__name__)
Expand All @@ -25,9 +25,9 @@ class AcquireSlurmConfig:
required=False,
help=(
"Cluster config file date (format YYYY-MM-DD). "
"Used for file versioning. Should represent a day when config file has been updated "
"(e.g. for new GPU billings, node GPUs, etc.). "
"If not specified, uses current day and downloads config file from cluster."
"Used for file versioning. Should represent day when config file has been downloaded. "
"If not specified, uses current day and downloads config file from cluster. "
"NB: Day is assumed to be in local timezone."
),
)
threshold: float = field(
Expand All @@ -52,17 +52,17 @@ def execute(self) -> int:
cluster_config = config("scraping").clusters[self.cluster_name]
parser = SlurmConfigParser(
cluster_config,
self.day,
datetime.strptime(self.day, "%Y-%m-%d").replace(tzinfo=TZLOCAL),
parse_gpu_billing=parse_gpu_billing,
threshold=self.threshold,
)
slurm_conf = parser.get_slurm_config()
if slurm_conf.gpu_to_billing is not None:
_gpu_billing_collection().save_gpu_billing(
self.cluster_name, parser.day, slurm_conf.gpu_to_billing
self.cluster_name, parser.day.astimezone(UTC), slurm_conf.gpu_to_billing
)
_node_gpu_mapping_collection().save_node_gpu_mapping(
self.cluster_name, parser.day, slurm_conf.node_to_gpus
self.cluster_name, parser.day.astimezone(UTC), slurm_conf.node_to_gpus
)
return 0

Expand All @@ -89,21 +89,32 @@ class SlurmConfigParser:
def __init__(
self,
cluster: ClusterConfig,
day: str | None = None,
day: datetime | None = None,
parse_gpu_billing: bool = True,
threshold: float = 0.1,
):
if day is None:
# No day given, get current day
day = datetime.now().strftime("%Y-%m-%d")
day = datetime.now(tz=TZLOCAL).replace(
hour=0, minute=0, second=0, microsecond=0
)
# Cache must download slurm conf file and save it locally.
cache_policy = CachePolicy.use
logger.info(f"Looking for config file at current date: {day}")
else:
# Day given. Slurm conf file must be retrieved from cache only.
cache_policy = CachePolicy.always

# We want a day, i.e. datetime at 00h 00min 00sec 00microsec
assert (
day.hour == 0
and day.minute == 0
and day.second == 0
and day.microsecond == 0
), day

self.cluster = cluster
self.day = day
self.day: datetime = day
self.cache_policy = cache_policy
self.parse_gpu_billing = bool(parse_gpu_billing)
self.threshold = threshold
Expand All @@ -123,7 +134,8 @@ def _get_slurm_conf(self) -> str:
return result.stdout

def _cache_key(self) -> str:
return f"slurm.{self.cluster.name}.{self.day}.conf"
day_str = self.day.strftime("%Y-%m-%d")
return f"slurm.{self.cluster.name}.{day_str}.conf"

@classmethod
def _file_lines(cls, file) -> Iterator[tuple[int, str]]:
Expand Down
21 changes: 4 additions & 17 deletions sarc/client/gpumetrics.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from __future__ import annotations

import logging
from datetime import datetime, time
from types import SimpleNamespace

from iguane.fom import RAWDATA, fom_ugr
from pydantic import field_validator
from pydantic_mongo import AbstractRepository, PydanticObjectId

from sarc.config import MTL, UTC, config, scraping_mode_required
from sarc.config import config, scraping_mode_required
from sarc.core.models.validators import datetime_utc
from sarc.model import BaseModel

logger = logging.getLogger(__name__)
Expand All @@ -21,21 +20,9 @@ class GPUBilling(BaseModel):
id: PydanticObjectId | None = None

cluster_name: str
since: datetime
since: datetime_utc
gpu_to_billing: dict[str, float]

@field_validator("since", mode="before")
@classmethod
def _ensure_since(cls, value: str | datetime) -> datetime:
    """Normalize `since` into a timezone-aware datetime expressed in MTL.

    A string is parsed with `datetime.fromisoformat` and interpreted as
    midnight (MTL) of that date. A datetime without tzinfo is labelled UTC
    before conversion; a datetime that already carries a tzinfo keeps its
    instant and is only converted to MTL.
    """
    if isinstance(value, str):
        return datetime.combine(datetime.fromisoformat(value), time.min).replace(
            tzinfo=MTL
        )
    assert isinstance(value, datetime)
    if value.tzinfo is None:
        # Only naive values are labelled UTC. Overwriting the tzinfo of an
        # aware datetime would silently move it to a different instant.
        value = value.replace(tzinfo=UTC)
    return value.astimezone(MTL)


class GPUBillingRepository(AbstractRepository[GPUBilling]):
class Meta:
Expand All @@ -45,7 +32,7 @@ class Meta:
def save_gpu_billing(
self,
cluster_name: str,
since: str,
since: datetime_utc,
gpu_to_billing: dict[str, float],
) -> None:
"""Save GPU->billing mapping into database."""
Expand Down
30 changes: 6 additions & 24 deletions sarc/jobs/node_gpu_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

import bisect
import logging
from datetime import datetime, time
from datetime import datetime
from typing import Optional

from pydantic import field_validator
from pydantic_mongo import AbstractRepository, PydanticObjectId

from sarc.config import MTL, UTC, config, scraping_mode_required
from sarc.config import config, scraping_mode_required
from sarc.core.models.validators import datetime_utc
from sarc.model import BaseModel

logger = logging.getLogger(__name__)
Expand All @@ -21,31 +21,13 @@ class NodeGPUMapping(BaseModel):
id: PydanticObjectId | None = None

cluster_name: str
since: datetime
since: datetime_utc
node_to_gpu: dict[str, list[str]]

@field_validator("since", mode="before")
@classmethod
def _ensure_since(cls, value: str | datetime) -> datetime:
    """Coerce the raw `since` input into a datetime by delegating to `parse_since`."""
    parsed = parse_since(value)
    return parsed

def __lt__(self, other):
return self.since < other.since


def parse_since(since: str | datetime) -> datetime:
    """Convert `since` into a timezone-aware datetime expressed in MTL.

    Strings are parsed with `datetime.fromisoformat` and pinned to midnight
    (MTL) of the parsed date. Datetimes lacking tzinfo are labelled UTC first;
    every datetime is then converted to MTL.
    """
    if not isinstance(since, str):
        assert isinstance(since, datetime)
        aware = since if since.tzinfo is not None else since.replace(tzinfo=UTC)
        return aware.astimezone(MTL)

    midnight = datetime.combine(datetime.fromisoformat(since), time.min)
    return midnight.replace(tzinfo=MTL)


class NodeGPUMappingRepository(AbstractRepository[NodeGPUMapping]):
class Meta:
collection_name = "node_gpu_mapping"
Expand All @@ -54,11 +36,11 @@ class Meta:
def save_node_gpu_mapping(
self,
cluster_name: str,
since: str | datetime,
since: datetime_utc,
node_to_gpu: dict[str, list[str]],
):
mapping = NodeGPUMapping(
cluster_name=cluster_name, since=parse_since(since), node_to_gpu=node_to_gpu
cluster_name=cluster_name, since=since, node_to_gpu=node_to_gpu
)
# Check if a node->GPU mapping was already registered
# for given cluster and date.
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def tzlocal_is_mtl(monkeypatch):
monkeypatch.setattr(
"sarc.cli.acquire.jobs.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal")
)
monkeypatch.setattr(
"sarc.cli.acquire.slurmconfig.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal")
)


@pytest.fixture
Expand Down
17 changes: 10 additions & 7 deletions tests/functional/cli/acquire/test_acquire_slurmconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sarc.cache import CacheException
from sarc.cli.acquire.slurmconfig import InconsistentGPUBillingError, SlurmConfigParser
from sarc.client.gpumetrics import GPUBilling, get_cluster_gpu_billings
from sarc.config import MTL, config
from sarc.config import MTL, config, UTC
from sarc.jobs.node_gpu_mapping import NodeGPUMapping, get_node_to_gpu
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

Expand Down Expand Up @@ -90,7 +90,7 @@ def _setup_logging_do_nothing(*args, **kwargs):
"""


@pytest.mark.usefixtures("empty_read_write_db", "enabled_cache")
@pytest.mark.usefixtures("empty_read_write_db", "enabled_cache", "tzlocal_is_mtl")
def test_acquire_slurmconfig(cli_main, caplog, monkeypatch):
monkeypatch.setattr("sarc.cli.setupLogging", _setup_logging_do_nothing)

Expand Down Expand Up @@ -120,7 +120,7 @@ def test_acquire_slurmconfig(cli_main, caplog, monkeypatch):

expected_gpu_billing_1 = GPUBilling(
cluster_name="raisin",
since="2020-01-01",
since=datetime(2020, 1, 1, tzinfo=MTL).astimezone(UTC),
gpu_to_billing={
"gpu1": 5000,
"THE GPU II": 7500,
Expand All @@ -130,7 +130,7 @@ def test_acquire_slurmconfig(cli_main, caplog, monkeypatch):

expected_node_to_gpu_1 = NodeGPUMapping(
cluster_name="raisin",
since="2020-01-01",
since=datetime(2020, 1, 1, tzinfo=MTL).astimezone(UTC),
node_to_gpu={
**{
node_name: ["gpu1"]
Expand Down Expand Up @@ -162,7 +162,7 @@ def test_acquire_slurmconfig(cli_main, caplog, monkeypatch):
)
expected_gpu_billing_2 = GPUBilling(
cluster_name="raisin",
since="2020-05-01",
since=datetime(2020, 5, 1, tzinfo=MTL).astimezone(UTC),
gpu_to_billing={
"gpu1": 4000,
"THE GPU II": 9000,
Expand All @@ -175,7 +175,7 @@ def test_acquire_slurmconfig(cli_main, caplog, monkeypatch):

expected_node_to_gpu_2 = NodeGPUMapping(
cluster_name="raisin",
since="2020-05-01",
since=datetime(2020, 5, 1, tzinfo=MTL).astimezone(UTC),
node_to_gpu=expected_node_to_gpu_1.node_to_gpu.copy(),
)
del expected_node_to_gpu_2.node_to_gpu["alone_node"]
Expand Down Expand Up @@ -287,7 +287,10 @@ def assert_same_node_gpu_mapping(


def _save_slurm_conf(cluster_name: str, day: str, content: str):
scp = SlurmConfigParser(config().clusters[cluster_name], day)
scp = SlurmConfigParser(
config().clusters[cluster_name],
datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=MTL),
)
folder = "slurm_conf"
filename = scp._cache_key()
cache_dir = config().cache
Expand Down
6 changes: 3 additions & 3 deletions tests/functional/jobs/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def create_gpu_billings():
return [
{
"cluster_name": "patate",
"since": "2023-02-15",
"since": datetime(2023, 2, 15, tzinfo=MTL).astimezone(UTC),
"gpu_to_billing": {
"patate_gpu_no_rgu_with_billing": 120,
"patate_gpu_with_rgu_with_billing": 90,
Expand All @@ -509,7 +509,7 @@ def create_gpu_billings():
},
{
"cluster_name": "patate",
"since": "2023-02-18",
"since": datetime(2023, 2, 18, tzinfo=MTL).astimezone(UTC),
"gpu_to_billing": {
"patate_gpu_no_rgu_with_billing": 240, # / 2
"patate_gpu_with_rgu_with_billing": 180, # x 2
Expand All @@ -518,7 +518,7 @@ def create_gpu_billings():
},
{
"cluster_name": "raisin",
"since": "2023-02-15",
"since": datetime(2023, 2, 15, tzinfo=MTL).astimezone(UTC),
"gpu_to_billing": {
"raisin_gpu_no_rgu_with_billing": 150,
"raisin_gpu_with_rgu_with_billing": 50,
Expand Down
18 changes: 2 additions & 16 deletions tests/functional/jobs/test_func_prometheus_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,14 @@
from opentelemetry.trace import StatusCode

from sarc.client.job import JobStatistics, get_jobs, get_available_clusters
from sarc.config import MTL, UTC, config
from sarc.config import MTL, UTC
from sarc.jobs import prometheus_scraping

from .factory import create_sacct_json
from ..cli.acquire.test_acquire_slurmconfig import _save_slurm_conf
from ...common.dateutils import _dtfmt, _dtstr, _dtreg


def _save_slurm_conf(cluster_name: str, day: str, content: str):
    """Write `content` into the cache as the Slurm config file of a cluster.

    Parameters:
        cluster_name: key of the cluster in `config().clusters`.
        day: config file date, formatted ``YYYY-MM-DD``; interpreted as
            midnight in the MTL timezone.
        content: text to write into the cached config file.

    The file name comes from `SlurmConfigParser._cache_key()`. The file is
    placed in the ``slurm_conf`` sub-folder of the configured cache directory,
    which is created when missing.
    """
    from datetime import datetime

    from sarc.cli.acquire.slurmconfig import SlurmConfigParser

    # SlurmConfigParser takes a datetime at midnight rather than a date string,
    # so convert here the same way the slurmconfig test helper does.
    day_dt = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=MTL)
    scp = SlurmConfigParser(config().clusters[cluster_name], day_dt)
    folder = "slurm_conf"
    filename = scp._cache_key()
    cache_dir = config().cache
    file_dir = cache_dir / folder
    file_dir.mkdir(parents=True, exist_ok=True)
    file_path = file_dir / filename
    print(file_path)
    with file_path.open("w") as file:
        file.write(content)


@pytest.fixture
def mock_compute_job_statistics(monkeypatch):
def mock_func(job):
Expand Down