Skip to content
This repository was archived by the owner on Oct 2, 2024. It is now read-only.

Commit a50d23b

Browse files
committed
Merge branch 'master' into run-oci-bundles_1754
2 parents 94545fe + 649c710 commit a50d23b

20 files changed

+342
-63
lines changed

.github/workflows/main.yml

+9
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ jobs:
4747
steps:
4848
- uses: actions/checkout@v3
4949

50+
# This allows SSH access to the GitHub Actions VM to debug things that
51+
# only happen on CI. Comment out unless needed. WARNING: tmate.io has
52+
# access to unencrypted SSH traffic.
53+
# See: https://github.com/marketplace/actions/debugging-with-tmate
54+
#- name: set up tmate session
55+
# uses: mxschmitt/action-tmate@v3
56+
# with:
57+
# detached: true
58+
5059
- name: early setup & validation
5160
run: |
5261
[[ -n $CH_TEST_BUILDER ]]

bin/Makefile.am

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ if HAVE_LIBSQUASHFUSE
1313
ch_run_SOURCES += ch_fuse.h ch_fuse.c
1414
endif
1515

16-
ch_run_CFLAGS = $(CFLAGS) $(PTHREAD_CFLAGS)
16+
# additional build flags for ch-run
17+
ch_run_CFLAGS = $(PTHREAD_CFLAGS)
1718
ch_run_LDADD = $(CH_RUN_LIBS)
1819

1920

bin/ch-image.py.in

+2
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,8 @@ if (__name__ == "__main__"):
415415
for (opt, arg) in zip(sys.argv[1:], sys.argv[2:] + [None]):
416416
(opt, _, arg_eq) = opt.partition("=")
417417
if (opt == "--break"):
418+
if (not sys.stdin.isatty()):
419+
ch.FATAL("--break: standard input must be a terminal")
418420
if (arg_eq != ""):
419421
arg = arg_eq
420422
try:

bin/ch-run.c

+12
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
449449
exit(0);
450450
#else
451451
exit(1);
452+
#endif
453+
} else if (!strcmp(arg, "overlayfs")) {
454+
#ifdef HAVE_OVERLAYFS
455+
exit(0);
456+
#else
457+
exit(1);
452458
#endif
453459
} else if (!strcmp(arg, "seccomp")) {
454460
#ifdef HAVE_SECCOMP
@@ -461,6 +467,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state)
461467
exit(0);
462468
#else
463469
exit(1);
470+
#endif
471+
} else if (!strcmp(arg, "tmpfs-xattrs")) {
472+
#ifdef HAVE_TMPFS_XATTRS
473+
exit(0);
474+
#else
475+
exit(1);
464476
#endif
465477
}
466478
else

bin/ch_core.c

+33-12
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,27 @@
4141
/* Timeout in seconds for waiting for join semaphore. */
4242
#define JOIN_TIMEOUT 30
4343

44-
/* Maximum length of paths we're willing to deal with. (Note that
44+
/* Maximum length of paths we’re willing to deal with. (Note that
4545
system-defined PATH_MAX isn't reliable.) */
4646
#define PATH_CHARS 4096
4747

48+
/* Mount point for the tmpfs used by -W. We want this to be (a) always
49+
available [1], (b) short, (c) not used by anything else we care about
50+
during container setup, and (d) not wildly confusing if users see it in an
51+
error message. Must be a string literal because we use C’s literal
52+
concatenation feature. Options considered (all of these required by FHS):
53+
54+
/boot Not present if host is booted in some strange way?
55+
/etc Likely very reliable but seems risky
56+
/mnt Used for images on GitHub Actions and causes CI failures
57+
/opt Seems very omittable
58+
/srv I’ve never actually seen it used; reliable?
59+
/var Too aggressive?
60+
/var/spool Long; omittable for lightweight hosts?
61+
62+
[1]: https://www.pathname.com/fhs/pub/fhs-2.3.pdf */
63+
#define WF_MNT "/srv"
64+
4865

4966
/** Constants **/
5067

@@ -306,26 +323,30 @@ void enter_udss(struct container *c)
306323
// https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html
307324
// https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html
308325
if (c->overlay_size != NULL) {
309-
VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size);
310326
char *options;
327+
struct stat st;
328+
VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size);
311329
T_ (1 <= asprintf(&options, "size=%s", c->overlay_size));
312-
Zf (mount(NULL, "/mnt", "tmpfs", 0, options), // host should have /mnt
330+
Zf (mount(NULL, WF_MNT, "tmpfs", 0, options),
313331
"cannot mount tmpfs for overlay");
314332
free(options);
315-
Z_ (mkdir("/mnt/upper", 0700));
316-
Z_ (mkdir("/mnt/work", 0700));
317-
Z_ (mkdir("/mnt/merged", 0700));
318-
mkdir_scratch = "/mnt/mkdir_overmount";
333+
Z_ (mkdir(WF_MNT "/upper", 0700));
334+
Z_ (mkdir(WF_MNT "/work", 0700));
335+
Z_ (mkdir(WF_MNT "/merged", 0700));
336+
mkdir_scratch = WF_MNT "/mkdir_overmount";
319337
Z_ (mkdir(mkdir_scratch, 0700));
320-
T_ (1 <= asprintf(&options, "lowerdir=%s,upperdir=%s,workdir=%s,"
321-
"index=on,userxattr,volatile",
322-
c->newroot, "/mnt/upper", "/mnt/work"));
338+
T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s,"
339+
"index=on,userxattr,volatile"),
340+
c->newroot, WF_MNT "/upper", WF_MNT "/work"));
323341
// update newroot
324-
c->newroot = "/mnt/merged";
342+
Zf (stat(c->newroot, &st),
343+
"can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot);
344+
c->newroot = WF_MNT "/merged";
325345
free(nr_parent);
326346
free(nr_base);
327347
path_split(c->newroot, &nr_parent, &nr_base);
328-
Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay");
348+
Zf (mount(NULL, c->newroot, "overlay", 0, options),
349+
"can't overlay: %s, %s", c->newroot, options);
329350
VERBOSE("newroot updated: %s", c->newroot);
330351
free(options);
331352
}

bin/ch_fuse.c

+5
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@
2727
// SquashFUSE redefines __le16 unless HAVE_LINUX_TYPES_LE16 is defined. We are
2828
// assuming it is defined in <linux/types.h> on your machine.
2929
#define HAVE_LINUX_TYPES_LE16
30+
// The forget operation in libfuse3 takes uint64_t as third parameter,
31+
// while SquashFUSE defaults to unsigned long as used in libfuse2.
32+
// This causes a mess on arches with different size of these types,
33+
// so explicitly switch to the libfuse3 variant.
34+
#define HAVE_FUSE_LL_FORGET_OP_64T
3035
// Now we can include ll.h.
3136
#include <squashfuse/ll.h>
3237

configure.ac

+6-1
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ AC_MSG_RESULT($have_userns)
343343
AC_DEFUN([CH_OVERLAY_C], [[
344344
#define _GNU_SOURCE
345345
#include <errno.h>
346+
#include <fcntl.h>
346347
#include <sched.h>
347348
#include <stdio.h>
348349
#include <stdlib.h>
@@ -786,6 +787,10 @@ AC_SUBST([CH_RUN_LIBS])
786787
AC_SUBST([PYTHON_SHEBANG])
787788
AC_SUBST([SPHINX])
788789

790+
AS_IF([test $have_overlayfs = yes],
791+
[AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])])
792+
AS_IF([test $have_tmpfs_xattrs = yes],
793+
[AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])])
789794
AS_IF([test $have_fnm_extmatch = yes],
790795
[AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])])
791796
AS_IF([test $have_seccomp = yes],
@@ -941,7 +946,7 @@ Building Charliecloud
941946
test suite ... ${enable_test}
942947
943948
required:
944-
C99 compiler ... ${CC} ${CC_VERSION}
949+
C99 compiler ... ${CC} ${CFLAGS}
945950
946951
optional:
947952
extended glob patterns in --unset-env ... ${have_fnm_extmatch}

doc/best_practices.rst

+150-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
Best practices
22
**************
33

4+
.. contents::
5+
:depth: 3
6+
:local:
7+
48
Other best practices information
59
================================
610

@@ -303,5 +307,149 @@ building, and then run using a separate container invoked from a different
303307
terminal.
304308

305309

306-
.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist
307-
.. LocalWords: ident OCFS MAGICK
310+
MPI
311+
===
312+
313+
Problems that best practices help you avoid
314+
-------------------------------------------
315+
316+
These recommendations are derived from our experience in mitigating container
317+
MPI issues. It is important to note that, despite marketing claims, no single
318+
container implementation has “solved” MPI or is free of warts; the issues are
319+
numerous, multifaceted, and dynamic.
320+
321+
Key concepts and related issues include:
322+
323+
1. **Workload management**. Running applications on HPC clusters requires
324+
resource management and job scheduling. Put simply, resource management
325+
is the act of allocating and restricting compute resources, e.g., CPU and
326+
memory, whereas job scheduling is the act of prioritizing and enforcing
327+
resource management. *Both require privileged operations.*
328+
329+
Some privileged container implementations attempt to provide their own
330+
workload management, often referred to as “container orchestration”.
331+
332+
Charliecloud is lightweight and completely unprivileged. We rely on
333+
existing, reputable, and well-established HPC workload managers such as
334+
Slurm.
335+
336+
2. **Job launch**. When a multi-node MPI job is launched, each node must
337+
launch a number of containerized processes, i.e., *ranks*. Doing this
338+
unprivileged and at scale requires interaction between the application
339+
and workload manager. That is, something like Process Management
340+
Interface (PMI) is needed to facilitate the job launch.
341+
342+
3. **Shared memory**. Processes in separate sibling containers cannot use
343+
single-copy *cross-memory attach* (CMA), as opposed to double-copy POSIX
344+
or SysV shared memory. The solution is to put all ranks in the *same*
345+
container with :code:`ch-run --join`. (See above for details:
346+
:ref:`faq_join`.)
347+
348+
4. **Network fabric.** Performant MPI jobs must recognize and use a system’s
349+
high-speed interconnect. Common issues that arise are:
350+
351+
a. Libraries required to use the interconnect are proprietary or
352+
otherwise unavailable to the container.
353+
354+
b. The interconnect is not supported by the container MPI.
355+
356+
In both cases, the containerized MPI application will either fail or run
357+
significantly slower.
358+
359+
These problems can be avoided, and this section describes our recommendations
360+
to do so.
361+
362+
Recommendations TL;DR
363+
---------------------
364+
365+
Generally, we recommend building a flexible MPI container using:
366+
367+
a. **libfabric** to flexibly manage process communication over a diverse
368+
set of network fabrics;
369+
370+
b. a parallel **process management interface** (PMI), compatible with the
371+
host workload manager (e.g., PMI2, PMIx, flux-pmi); and
372+
373+
c. an **MPI** that supports (1) libfabric and (2) the selected PMI.
374+
375+
More experienced MPI and unprivileged container users can find success through
376+
MPI replacement (injection); however, such practices are beyond the scope of
377+
this document.
378+
379+
The remaining sections detail the reasoning behind our approach. We recommend
380+
referencing, or directly using, our examples
381+
:code:`examples/Dockerfile.{libfabric,mpich,openmpi}`.
382+
383+
Use libfabric
384+
-------------
385+
386+
`libfabric <https://ofiwg.github.io/libfabric>`_ (a.k.a. Open Fabrics
387+
Interfaces or OFI) is a low-level communication library that abstracts diverse
388+
networking technologies. It defines *providers* that implement the mapping
389+
between application-facing software (e.g., MPI) and network specific drivers,
390+
protocols, and hardware. These providers have been co-designed with fabric
391+
hardware and application developers with a focus on HPC needs. libfabric lets
392+
us more easily manage MPI communication over diverse high-speed network
393+
interconnects (a.k.a. *fabrics*).
394+
395+
From our libfabric example (:code:`examples/Dockerfile.libfabric`):
396+
397+
.. literalinclude:: ../examples/Dockerfile.libfabric
398+
:language: docker
399+
:lines: 116-135
400+
401+
The above compiles libfabric with several “built-in” providers, i.e.,
402+
:code:`psm3` (on x86-64), :code:`rxm`, :code:`shm`, :code:`tcp`, and
403+
:code:`verbs`, which enables MPI applications to run efficiently over most
404+
verb devices using TCP, IB, OPA, and RoCE protocols.
405+
406+
Two key advantages of using libfabric are: (1) the container’s libfabric can
407+
make use of “external” (i.e., dynamic-shared-object, DSO) providers, and
408+
(2) libfabric replacement is simpler than MPI replacement and preserves the
409+
original container MPI. That is, managing host/container ABI compatibility is
410+
difficult and error-prone, so we instead manage the more forgiving libfabric
411+
ABI compatibility.
412+
413+
A DSO provider can be used by a libfabric that did not originally compile it,
414+
i.e., they can be compiled on a target host and later injected into the
415+
container along with any missing shared library dependencies, and used by the
416+
container's libfabric. To build a libfabric provider as a DSO, add :code:`=dl`
417+
to its :code:`configure` argument, e.g., :code:`--with-cxi=dl`.
418+
419+
A container's libfabric can also be replaced by a host libfabric. This is a
420+
brittle but usually effective way to give containers access to the Cray
421+
libfabric Slingshot provider :code:`cxi`.
422+
423+
In Charliecloud, both of these injection operations are currently done with
424+
:code:`ch-fromhost`, though see `issue #1861
425+
<https://github.com/hpc/charliecloud/issues/1861>`_.
426+
427+
Choose a compatible PMI
428+
-----------------------
429+
430+
Unprivileged processes, including unprivileged containerized processes, are
431+
unable to independently launch containerized processes on different nodes,
432+
aside from using SSH, which isn’t scalable. We must either (1) rely on a host
433+
supported parallel process management interface (PMI), or (2) achieve
434+
host/container MPI ABI compatibility through unsavory practices such as
435+
complete container MPI replacement.
436+
437+
The preferred PMI implementation, e.g., PMI1, PMI2, OpenPMIx, or flux-pmi,
438+
will be that which is best supported by your host workload manager and
439+
container MPI.
440+
441+
In :code:`examples/Dockerfile.libfabric`, we selected :code:`OpenPMIx` because
442+
(1) it is supported by SLURM, OpenMPI, and MPICH, (2) it is required for
443+
exascale, and (3) OpenMPI versions 5 and newer will no longer support PMI2.
444+
445+
Choose an MPI compatible with your libfabric and PMI
446+
----------------------------------------------------
447+
448+
There are various MPI implementations, e.g., OpenMPI, MPICH, MVAPICH2,
449+
Intel-MPI, etc., to consider. We generally recommend OpenMPI; however, your
450+
MPI implementation of choice will ultimately be that which best supports the
451+
libfabric and PMI most compatible with your hardware and workload manager.
452+
453+
454+
.. LocalWords: userguide Gruening Souppaya Morello Scarfone openmpi nist dl
455+
.. LocalWords: ident OCFS MAGICK mpich psm rxm shm DSO pmi MVAPICH

doc/ch-run.rst

+10-3
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,16 @@ mounting SquashFS images with FUSE.
6060
Don’t expand variables when using :code:`--set-env`.
6161

6262
:code:`--feature=FEAT`
63-
If feature :code:`FEAT` is enabled, exit with success. Valid values of
64-
:code:`FEAT` are :code:`extglob` for extended globs, :code:`seccomp` for
65-
:code:`seccomp(2)`, and :code:`squash` for squashfs archives.
63+
If feature :code:`FEAT` is enabled, exit successfully (zero); otherwise,
64+
exit unsuccessfully (non-zero). Note this just communicates the results of
65+
:code:`configure` rather than testing the feature. Valid values of
66+
:code:`FEAT` are:
67+
68+
* :code:`extglob`: extended globs in :code:`--unset-env`
69+
* :code:`seccomp`: :code:`--seccomp` available
70+
* :code:`squash`: internal SquashFUSE image mounts
71+
* :code:`overlayfs`: unprivileged overlayfs support
72+
* :code:`tmpfs-xattrs`: :code:`user` xattrs on tmpfs
6673

6774
:code:`-g`, :code:`--gid=GID`
6875
Run as group :code:`GID` within container.

doc/faq.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1364,4 +1364,4 @@ conversion. Important caveats include:
13641364

13651365
.. LocalWords: CAs SY Gutmann AUTH rHsFFqwwqh MrieaQ Za loc mpihello mvo du
13661366
.. LocalWords: VirtualSize linuxcontainers jour uk lxd rwxr xr qq qqq drwxr
1367-
.. LocalWords: drwx
1367+
.. LocalWords: drwx mpich

doc/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ package managers.
224224
Maintained by us:
225225

226226
* `Spack
227-
<https://spack.readthedocs.io/en/latest/package_list.html#charliecloud>`_;
227+
<https://packages.spack.io/package.html?name=charliecloud>`_;
228228
install with :code:`+builder` to get :code:`ch-image`.
229229
* `Fedora/EPEL <https://bodhi.fedoraproject.org/updates/?search=charliecloud>`_;
230230
check for available versions with :code:`{yum,dnf} list charliecloud`.

examples/obspy/Dockerfile

+6-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,13 @@ WORKDIR /usr/local/src
1717
#
1818
# 2. Use latest version so we catch sooner if things explode.
1919
#
20+
# 3. ObsPy 1.4.0, the latest as of 2024-03-27, is incompatible with Python
21+
# 3.12 [2], which is recently the default in Miniconda (see PR #1885 and
22+
# issue #1886).
23+
#
2024
# [1]: https://docs.anaconda.com/anaconda/user-guide/faq/
21-
ARG MC_VERSION=latest
25+
# [2]: https://github.com/obspy/obspy/issues/3313#issuecomment-1818165937
26+
ARG MC_VERSION=py311_24.1.2-0
2227
ARG MC_FILE=Miniconda3-$MC_VERSION-Linux-x86_64.sh
2328
RUN wget -nv https://repo.anaconda.com/miniconda/$MC_FILE
2429
# Miniconda will fail if the HOME variable is not set.
@@ -32,7 +37,6 @@ RUN conda config --set auto_update_conda False
3237
# new environment for obspy.
3338
# See: https://github.com/obspy/obspy/wiki/Installation-via-Anaconda
3439
RUN conda config --add channels conda-forge
35-
# Use numpy 1.21 to avoid isse: https://github.com/obspy/obspy/issues/2940
3640
RUN conda install --yes obspy=1.4.0
3741
RUN conda update obspy
3842

0 commit comments

Comments
 (0)