Skip to content

Commit 4cf4d11

Browse files
committed
feat(panda): Support panda wms and idtoken operations
1 parent 9774bfa commit 4cf4d11

File tree

11 files changed

+516
-50
lines changed

11 files changed

+516
-50
lines changed

src/lsst/cmservice/common/htcondor.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Utility functions for working with htcondor jobs"""
22

33
import importlib.util
4-
import os
54
import sys
65
from collections.abc import Mapping
76
from types import ModuleType
@@ -198,7 +197,18 @@ def build_htcondor_submit_environment() -> Mapping[str, str]:
198197
# condor_environment = config.htcondor.model_dump(by_alias=True)
199198
# TODO we should not always use the same schedd host. We could get a list
200199
# of all schedds from the collector and pick one at random.
201-
return dict(
200+
201+
# FIXME / TODO
202+
# This has nothing to do with htcondor vs panda as a WMS; it is because CM
203+
# uses htcondor as its primary script-running engine for bps workflows even
204+
# if that workflow uses panda. Because of this, we need to also serialize
205+
# all of the panda environmental config for the subprocess to pick up.
206+
# We do this instead of delegating panda config to some arbitrary bash
207+
# script elsewhere in the filesystem whose only job is to set these env
208+
# vars for panda. This also allows us to provide our specific panda idtoken
209+
# as an env var instead of requiring the target process to pick it up from
210+
# some .token file that may or may not be present or valid.
211+
return config.panda.model_dump(by_alias=True, exclude_none=True) | dict(
202212
CONDOR_CONFIG=config.htcondor.config_source,
203213
_CONDOR_CONDOR_HOST=config.htcondor.collector_host,
204214
_CONDOR_COLLECTOR_HOST=config.htcondor.collector_host,
@@ -210,7 +220,7 @@ def build_htcondor_submit_environment() -> Mapping[str, str]:
210220
HOME=config.htcondor.remote_user_home,
211221
LSST_VERSION=config.bps.lsst_version,
212222
LSST_DISTRIB_DIR=config.bps.lsst_distrib_dir,
213-
# FIX: because there is no db-auth.yaml in lsstsvc1's home directory
223+
# FIXME: because there is no db-auth.yaml in lsstsvc1's home directory
214224
PGPASSFILE=f"{config.htcondor.remote_user_home}/.lsst/postgres-credentials.txt",
215225
PGUSER=config.butler.default_username,
216226
PATH=(
@@ -233,12 +243,6 @@ def import_htcondor() -> ModuleType | None:
233243
logger.warning("HTcondor not available.")
234244
return None
235245

236-
# Ensure environment is configured for htcondor operations
237-
# FIXME: the python process needs the correct condor env set up. Alternate
238-
# to setting these values JIT in the os.environ would be to hack a way to
239-
# have the config.htcondor submodel's validation_alias match the
240-
# serialization_alias, e.g., "_CONDOR_value"
241-
os.environ |= config.htcondor.model_dump(by_alias=True)
242246
htcondor.reload_config()
243247

244248
return htcondor

src/lsst/cmservice/common/panda.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""Module for PanDA operations within CM-Service"""
2+
3+
import datetime
4+
import json
5+
import os
6+
from collections.abc import Generator
7+
from contextlib import contextmanager
8+
from pathlib import Path
9+
10+
import httpx
11+
from pandaclient.openidc_utils import decode_id_token
12+
13+
from ..config import config
14+
from .logging import LOGGER
15+
16+
logger = LOGGER.bind(module=__name__)
17+
"""A module-level logger"""
18+
19+
20+
@contextmanager
def http_client() -> Generator[httpx.Client]:
    """Yield an ``httpx`` client set up for PanDA API operations.

    The client is built on an ``httpx.HTTPTransport`` created with
    ``retries=3`` and with TLS verification controlled by
    ``config.panda.verify_host``. The client is used as a context manager, so
    it is closed when the caller's ``with`` block exits.

    Yields
    ------
    httpx.Client
        The open client session.
    """
    panda_transport = httpx.HTTPTransport(retries=3, verify=config.panda.verify_host)
    client = httpx.Client(transport=panda_transport)
    with client:
        yield client
29+
30+
31+
def refresh_panda_token(url: str, data: dict[str, str]) -> str | None:
    """Refresh a panda auth token.

    Posts a form-encoded refresh request to the token endpoint and stores the
    result on the shared configuration object and in the process environment.

    Parameters
    ----------
    url : str
        The token endpoint to which the refresh request is posted.
    data : dict[str, str]
        The form fields of the refresh request.

    Returns
    -------
    str or None
        The new id token, as stored in ``config.panda.id_token``.

    Raises
    ------
    httpx.HTTPStatusError
        If the token endpoint responds with an error status.
    KeyError
        If the response has no ``id_token``, or the decoded token has no
        ``exp`` claim.
    """
    with http_client() as session:
        response = session.post(
            url=url, data=data, headers={"content-type": "application/x-www-form-urlencoded"}
        )
        response.raise_for_status()

    token_data: dict[str, str] = response.json()
    # with the new token...
    # - update the configuration object
    config.panda.id_token = token_data["id_token"]
    # A token endpoint is not required to issue a new refresh token with every
    # refresh response; when it does not, keep using the one we already hold
    # instead of failing with a KeyError.
    config.panda.refresh_token = token_data.get("refresh_token") or config.panda.refresh_token
    # - update the process environment
    os.environ["PANDA_AUTH_ID_TOKEN"] = config.panda.id_token
    # - update token expiry
    decoded_token = decode_id_token(config.panda.id_token)
    config.panda.token_expiry = float(decoded_token["exp"])  # type: ignore
    return config.panda.id_token
50+
51+
52+
def get_panda_token() -> str | None:
    """Fetch a panda id token from configuration or a token file as necessary.
    If a token does not exist or is near expiry, create or refresh a token.

    Returns
    -------
    str or None
        The string value of a panda id token or None if no such token exists or
        can be created.

    Notes
    -----
    This function should be called at application startup to bootstrap an id
    token, and again before panda operations that may require the use of the
    id token, to ensure the validity within the token expiry time.

    The refresh operation never actually uses the current idtoken except to
    discover the expiry time. We don't actually need any bootstrap value for
    the idtoken if we start with a refresh token; the auth_config_url is
    determined from the panda url and the oidc VO.

    TODO: make this async if necessary, but the daemon is less sensitive to
    sync operations as long as they do not block indefinitely.
    """

    # If a token has been added to the configuration object, use it instead of
    # loading one from disk
    try:
        if config.panda.refresh_token is None:
            token_data = json.loads((Path(config.panda.config_root) / ".token").read_text())
            # Look up both values before storing either so that a token file
            # with a missing key does not leave the configuration half-updated.
            id_token, refresh_token = token_data["id_token"], token_data["refresh_token"]
            config.panda.id_token = id_token
            config.panda.refresh_token = refresh_token
    except (OSError, json.JSONDecodeError, KeyError):
        # A missing, unreadable, malformed or incomplete token file all mean
        # the same thing to the caller: there is no token to be had.
        logger.exception("Could not load a PanDA token from the token file")
        return None

    now_utc = datetime.datetime.now(datetime.UTC)

    # Determine whether the token should be renewed
    # The token expiry time is part of the encoded token
    try:
        decoded_token = decode_id_token(config.panda.id_token)
        # TODO if "exp" not in decoded_token: ...
        config.panda.token_expiry = float(decoded_token["exp"])  # type: ignore
    except Exception:
        # FIXME: this should generally be an AttributeError but the 3rdparty
        #        function may change its operation.
        # If current id_token is None or otherwise not decodable, we will get a
        # new one from the refresh operation
        logger.exception("Could not determine the expiry of the current PanDA id token")
        config.panda.token_expiry = now_utc

    # `renew_after` is expressed in seconds. It must be passed by keyword
    # because the first positional argument of `timedelta` is *days*, which
    # would make the renewal window centuries long and force a refresh on
    # every call.
    renewal_window = datetime.timedelta(seconds=config.panda.renew_after)

    if (config.panda.token_expiry - now_utc) < renewal_window:
        if config.panda.auth_config_url is None:
            logger.error("There is no PanDA auth config url known to the service, cannot refresh token.")
            return config.panda.id_token

        # NOTE(review): only HTTP error statuses are handled below; transport
        # level failures raised by the client still propagate to the caller.
        try:
            # TODO it is probably safe to cache these response tokens
            with http_client() as session:
                auth_config_response = session.get(config.panda.auth_config_url)
                auth_config_response.raise_for_status()
                panda_auth_config = auth_config_response.json()

                token_response = session.get(panda_auth_config["oidc_config_url"])
                token_response.raise_for_status()
                token_endpoint = token_response.json()["token_endpoint"]

            data = dict(
                client_id=panda_auth_config["client_id"],
                client_secret=panda_auth_config["client_secret"],
                grant_type="refresh_token",
                refresh_token=config.panda.refresh_token,
            )

            _ = refresh_panda_token(token_endpoint, data)
        except httpx.HTTPStatusError:
            logger.exception("PanDA token refresh failed")

    return config.panda.id_token

src/lsst/cmservice/config.py

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
from datetime import UTC, datetime
2+
from typing import Self
3+
from urllib.parse import urlparse
14
from warnings import warn
25

36
from dotenv import load_dotenv
4-
from pydantic import BaseModel, Field, field_validator
7+
from pydantic import BaseModel, Field, computed_field, field_serializer, field_validator, model_validator
58
from pydantic_settings import BaseSettings, SettingsConfigDict
69

710
from .common.enums import ScriptMethodEnum, StatusEnum, WmsComputeSite
@@ -210,6 +213,144 @@ class HTCondorConfiguration(BaseModel):
210213
)
211214

212215

216+
class PandaConfiguration(BaseModel, validate_assignment=True):
    """Configuration parameters for the PanDA WMS

    Fields that carry a ``serialization_alias`` are dumped under that name
    when the model is serialized with ``by_alias=True``; fields marked
    ``exclude=True`` are left out of the dump. Boolean switches are serialized
    as the strings ``"on"`` and ``"off"``.

    The model is declared with ``validate_assignment=True``, so validators run
    again every time an attribute is assigned.
    """

    tls_url: str | None = Field(
        description="Base HTTPS URL of PanDA server",
        serialization_alias="PANDA_URL_SSL",
        default=None,
    )

    url: str | None = Field(
        description="Base HTTP URL of PanDA server",
        serialization_alias="PANDA_URL",
        default=None,
    )

    monitor_url: str | None = Field(
        description="URL of PanDA monitor",
        serialization_alias="PANDAMON_URL",
        default=None,
    )

    cache_url: str | None = Field(
        description="Base URL of PanDA sandbox server",
        serialization_alias="PANDACACHE_URL",
        default=None,
    )

    virtual_organization: str = Field(
        description="Virtual organization name used with Panda OIDC",
        serialization_alias="PANDA_AUTH_VO",
        default="Rubin",
    )

    renew_after: int = Field(
        description="Minimum auth token lifetime in seconds before renewal attempts are made",
        default=302_400,
        exclude=True,
    )

    # The presence of this environment variable should cause the panda client
    # to use specified token directly, skipping IO related to reading a token
    # file.
    id_token: str | None = Field(
        description="Current id token for PanDA authentication",
        serialization_alias="PANDA_AUTH_ID_TOKEN",
        default=None,
    )

    refresh_token: str | None = Field(
        description="Current refresh token for PanDA token operations",
        default=None,
        exclude=True,
    )

    # NOTE: this default is evaluated once, when the class body is executed,
    # so every instance that does not set the field starts with that same
    # timestamp.
    token_expiry: datetime = Field(
        description="Time at which the current idtoken expires",
        default=datetime.now(tz=UTC),
        exclude=True,
    )

    config_root: str = Field(
        description="Location of the PanDA .token file",
        serialization_alias="PANDA_CONFIG_ROOT",
        default="/var/run/secrets/panda",
        exclude=True,
    )

    auth_type: str = Field(
        description="Panda Auth type",
        serialization_alias="PANDA_AUTH",
        default="oidc",
    )

    behind_lb: bool = Field(
        description="Whether Panda is behind a loadbalancer",
        default=False,
        serialization_alias="PANDA_BEHIND_REAL_LB",
    )

    verify_host: bool = Field(
        description="Whether to verify PanDA host TLS",
        default=True,
        serialization_alias="PANDA_VERIFY_HOST",
    )

    use_native_httplib: bool = Field(
        description="Use native http lib instead of curl",
        default=True,
        serialization_alias="PANDA_USE_NATIVE_HTTPLIB",
    )

    @computed_field(repr=False)  # type: ignore[prop-decorator]
    @property
    def auth_config_url(self) -> str | None:
        """Location of auth config for PanDA VO.

        The URL is built from the scheme, hostname and (when present) port of
        ``tls_url`` together with the virtual organization name.

        Returns
        -------
        str or None
            The auth config URL, or None when ``tls_url`` is unset or no
            hostname can be parsed from it.
        """
        if self.tls_url is None:
            return None
        url_parts = urlparse(self.tls_url)
        if url_parts.hostname is None:
            return None
        # Only add a port when the URL names one explicitly; formatting a
        # missing port directly would put a literal ":None" into the URL.
        if url_parts.port is None:
            netloc = url_parts.hostname
        else:
            netloc = f"{url_parts.hostname}:{url_parts.port}"
        return f"{url_parts.scheme}://{netloc}/auth/{self.virtual_organization}_auth_config.json"

    @model_validator(mode="after")
    def set_base_url_fields(self) -> Self:
        """Set all url fields when only a subset of urls are supplied."""
        # NOTE: there is a danger of this validator creating a recursion error
        #       if unbounded field-setters are used. Every update to the model
        #       will itself trigger this validator because of the
        #       `validate_assignment` directive on the model itself.

        # If no panda urls have been specified there is no need to continue
        # with model validation
        if self.url is None and self.tls_url is None:
            return self
        # It does not seem critical that these URLs actually use the scheme
        # with which they are nominally associated, only that both be set.
        elif self.url is None:
            self.url = self.tls_url
        elif self.tls_url is None:
            self.tls_url = self.url

        # default the cache url to the tls url
        if self.cache_url is None:
            self.cache_url = self.tls_url
        return self

    @field_validator("token_expiry", mode="after")
    @classmethod
    def set_datetime_utc(cls, value: datetime) -> datetime:
        """Applies UTC timezone to datetime value."""
        # For tz-naive datetimes, treat the time as UTC in the first place
        # otherwise coerce the tz-aware datetime into UTC
        return value.replace(tzinfo=UTC) if value.tzinfo is None else value.astimezone(tz=UTC)

    @field_serializer("behind_lb", "verify_host", "use_native_httplib")
    def serialize_booleans(self, value: bool) -> str:  # noqa: FBT001
        """Serialize boolean fields as string values."""
        return "on" if value else "off"
352+
353+
213354
# TODO deprecate and remove "slurm"-specific logic from cm-service; it is
214355
# unlikely that interfacing with slurm directly from k8s will be possible.
215356
class SlurmConfiguration(BaseModel):
@@ -383,6 +524,7 @@ class Configuration(BaseSettings):
383524
htcondor: HTCondorConfiguration = HTCondorConfiguration()
384525
logging: LoggingConfiguration = LoggingConfiguration()
385526
slurm: SlurmConfiguration = SlurmConfiguration()
527+
panda: PandaConfiguration = PandaConfiguration()
386528

387529
# Root fields
388530
script_handler: ScriptMethodEnum = Field(

src/lsst/cmservice/daemon.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from asyncio import create_task
23
from collections.abc import AsyncGenerator
34
from contextlib import asynccontextmanager
@@ -11,6 +12,7 @@
1112
from . import __version__
1213
from .common.daemon import daemon_iteration
1314
from .common.logging import LOGGER
15+
from .common.panda import get_panda_token
1416
from .config import config
1517
from .routers.healthz import health_router
1618

@@ -22,6 +24,11 @@
2224
@asynccontextmanager
2325
async def lifespan(app: FastAPI) -> AsyncGenerator:
2426
# start
27+
# Bootstrap a panda id token
28+
_ = get_panda_token()
29+
# Update process environment with configuration models
30+
os.environ |= config.panda.model_dump(by_alias=True, exclude_none=True)
31+
os.environ |= config.htcondor.model_dump(by_alias=True, exclude_none=True)
2532
app.state.tasks = set()
2633
daemon = create_task(main_loop(), name="daemon")
2734
app.state.tasks.add(daemon)

0 commit comments

Comments
 (0)