@@ 0,0 1,328 @@
+;; Declare and combine linux mount_namespaces(7).
+;; Builds a self-contained executable that constructs an
+;; anonymous namespace according to the macro's input.
+(define-module (aqwari namespace)
+ #:use-module (ice-9 match)
+ #:use-module (srfi srfi-1)
+ #:use-module (srfi srfi-9)
+
+ #:use-module (guix gexp)
+ #:use-module (guix packages)
+
+ #:use-module (gnu packages base)
+ #:use-module (gnu packages certs)
+ #:use-module (gnu packages musl)
+ #:use-module (gnu packages linux)
+ #:use-module (gnu packages commencement)
+
+ #:export
+ (namespace
+ namespace?
+ tmpfs))
+
+(define-record-type <namespace>
+ (make-namespace mounts)
+ namespace?
+ (mounts namespace-mounts))
+
+;; A bind mount (e.g. mount --bind)
+(define-record-type <bind-mount>
+ (make-bind-mount target source)
+ bind-mount?
+ (target bind-mount-target)
+ (source bind-mount-source))
+
+;; An overlayfs union mount
+(define-record-type <overlay-mount>
+ (make-overlay-mount target lowerdir upperdir workdir)
+ overlay-mount?
+ (target overlay-mount-target) ;; dir to mount on
+ (lowerdir overlay-mount-lowerdir) ;; ordered list of read-only dirs
+ (upperdir overlay-mount-upperdir) ;; single writable directory
+ (workdir overlay-mount-workdir)) ;; writable merge dir, same fs as upperdir
+
+;; A tmpfs mount
+(define-record-type <tmpfs-mount>
+ (make-tmpfs-mount target option)
+ tmpfs-mount?
+ (target tmpfs-mount-target) ;; dir to mount on
+ (option tmpfs-mount-option)) ;; "size=10M", etc
+
+(define (tmpfs options)
+ "Create a template for a tmpfs mount. Re-using the result in multiple binds
+creates multiple distinct tmpfs file systems. OPTIONS is a comma-separated
+string of key=value parameters for tmpfs(5)"
+ (make-tmpfs-mount #nil options))
+
+(define mount-file
+ (match-lambda
+ (($ <bind-mount> target) target)
+ (($ <overlay-mount> target) target)
+ (($ <tmpfs-mount> target) target)))
+
+(define mount-spec
+ (match-lambda
+ (($ <bind-mount> _ source) source)
+ (($ <overlay-mount>) "overlay")
+ (($ <tmpfs-mount>) "tmpfs")))
+
+(define mount-type
+ (match-lambda
+ (($ <bind-mount>) "none")
+ (($ <overlay-mount>) "overlay")
+ (($ <tmpfs-mount>) "tmpfs")))
+
+(define mount-opts
+ (match-lambda
+ (($ <bind-mount>) "defaults,bind")
+ (($ <tmpfs-mount> _ opts) opts)
+
+ ;; upper, work, and any element of lower may be a G-expression, whose
+ ;; location is not known until build-time. So we don't build the option
+ ;; string yet and ship the parameters to the build-side code instead.
+ (($ <overlay-mount> _ lower upper work)
+ (list lower upper work))))
+
+(define bind->mount
+ ;; Parses the inputs of the (bind ...) macro in (namespace ...)
+ ;; and produces the appropriate mount (tmpfs, overlay, bind, etc)
+ (match-lambda*
+ (((? string? target) (? string? source))
+ (make-bind-mount target source))
+
+ (((? string? target) (? gexp? source))
+ (make-bind-mount target source))
+
+ ;; This is for convenience, so users don't have to wrap each package
+ ;; in a gexp. We also mount any propagated inputs, so they are present
+ ;; in the resulting namespace.
+ (((? string? target) (? package? pkg))
+ (apply values
+ (cons
+ (make-bind-mount target (gexp (ungexp pkg)))
+ (map
+ (lambda (dep) (bind->mount target dep))
+ (package-propagated-inputs pkg)))))
+
+ ;; Matching the various formats for describing inputs
+ (((? string? target) ((? string? pkgname) (? package? pkg)))
+ (bind->mount target pkg))
+
+ (((? string? target) ((? string? pkgname) (? package? pkg) (? string? output)))
+ (make-bind-mount target (gexp (ungexp pkg output))))
+
+ (((? string? target) ($ <tmpfs-mount> _ options))
+ (make-tmpfs-mount target options))
+
+ (((? string? target))
+ (make-bind-mount target target))
+
+ (((? list? targets))
+ (apply values (map bind->mount targets)))
+
+ (((? string? target) (? list? sources))
+ (apply values
+ (map (lambda (source) (bind->mount target source)) sources)))))
+
+(define-syntax namespace-args
+ (syntax-rules (bind include)
+ ((namespace-args ()) '())
+
+ ((namespace-args ((include ns) . rest))
+ (append
+ (namespace-mounts ns)
+ (namespace-args rest)))
+
+ ((namespace-args ((bind args ...) . rest))
+ (append
+ (call-with-values (lambda () (bind->mount args ...)) list)
+ (namespace-args rest)))))
+
+(define-syntax-rule (namespace . args)
+ (make-namespace (namespace-args args)))
+
+(define-public %namespace-minimal
+ (namespace
+ (bind '("/dev/" "/proc/"))
+ (bind "/" (list tzdata nss-certs))))
+
+(define (compare-mounts a b)
+ ;; Push tmpfs for a directory to the head of a sequence of mounts for
+ ;; the same mountpoint, because it must be mounted first if it is to
+ ;; be used for an overlay mount.
+ (if (tmpfs-mount? a)
+ (string<= (mount-file a) (mount-file b))
+ (string< (mount-file a) (mount-file b))))
+
+(define (collapse-mounts mounts)
+ "Create union mounts for multiple binds to the same location, if possible."
+ ;; The order of bindings to the same mountpoint is significant,
+ ;; so stable sort is a requirement.
+ (let loop ((args (stable-sort mounts compare-mounts)))
+ (match args
+ ('() '())
+
+ ;; overlayfs is kind of a PITA. For writable unions, it requires
+ ;; a writable dir to use as the "upperdir", where writes will go,
+ ;; *AND* an empty "workdir" for temporary storage, on the same
+ ;; file system as upperdir. The semantics we're supporting here is
+ ;; that if a tmpfs is part of a union mount, it is used as the
+ ;; upperdir and the union is made writable. To avoid polluting the
+ ;; new namespace, it is mounted *outside* the namespace root, and
+ ;; the `exec` helper populates it with a "work" and "upper" dir.
+ ;;
+ ;; Compare to the simple semantics of Plan 9's bind(1); additions to
+ ;; a union directory go to the first member of the union. That's it.
+ ((($ <tmpfs-mount> mnt opts)
+ ($ <bind-mount> mnt source) . rest)
+ (let* ((tmnt (string-append "/tmpfs" mnt))
+ (upper (string-append (string-trim-right tmnt #\/) "/upper"))
+ (work (string-append (string-trim-right tmnt #\/) "/work"))
+ (lower (list source)))
+ (cons
+ (make-tmpfs-mount tmnt opts)
+ (loop (cons (make-overlay-mount mnt lower upper work) rest)))))
+
+ ;; A side effect of the above is that whether or not a tmpfs is
+ ;; mounted in the new namespace is determined by whether or not
+ ;; it is a member of a union mount. So the build-side code must
+ ;; be told explicitly whether to mount it under "root" or "tmpfs"
+ ((($ <tmpfs-mount> mnt opts) . rest)
+ (cons
+ (make-tmpfs-mount (string-append "/root" mnt) opts)
+ (loop rest)))
+
+ ((($ <overlay-mount> mnt lower upper work)
+ ($ <bind-mount> mnt source) . rest)
+ (let ((stack (append lower (list source))))
+ (loop
+ (cons
+ (make-overlay-mount mnt stack upper work)
+ rest))))
+
+ ((($ <bind-mount> mnt dir1)
+ ($ <bind-mount> mnt dir2) . rest)
+ (loop
+ (cons
+ (make-overlay-mount mnt (list dir1 dir2) #f #f)
+ rest)))
+
+ (((= mount-file mnt) (= mount-file mnt) . rest)
+ (error "don't know how to combine ~a" (take args 2)))
+
+ ((fs . rest) (cons fs (loop rest))))))
+
+(define-gexp-compiler
+ (namespace-compiler (ns <namespace>) system target)
+
+ (define mounts (collapse-mounts (namespace-mounts ns)))
+
+ (define builder
+ (with-imported-modules '((guix build utils))
+ #~(begin
+ (use-modules
+ (ice-9 popen)
+ (ice-9 match)
+ (srfi srfi-1)
+ (srfi srfi-13)
+ (guix build utils))
+
+ (define (clean path) (string-trim-right path #\/))
+
+ (define (overlay-opts file lowerdir . rest)
+ (let ((lowerdir (append lowerdir (list (string-append #$output "/root" file)))))
+ (string-join
+ (map
+ string-append
+ (list "lowerdir=" "upperdir=" "workdir=")
+ (cons
+ (string-join lowerdir ":")
+ (map
+ (lambda (dir) (string-append #$output dir))
+ (remove not rest)))) ",")))
+
+ (define (write-exec-c-header port)
+ "Write C header describing the mount namespace to PORT"
+ (format port "struct fstab { ~a};~%"
+ (string-join
+ (cons "int upper"
+ (map
+ (lambda (field) (format #f "char *~a" field))
+ '("spec" "file" "type" "opts"))) "; " 'suffix))
+ (format port "const char *root = ~s;~%"
+ (string-append #$output "/root"))
+ (format port "const char *gnustore = ~s;~%"
+ (string-append #$output "/root/gnu/store"))
+ (format port "struct fstab fstab[] = {~%~a~%};~%"
+ (string-join
+ (map
+ (match-lambda*
+ (("tmpfs" file "tmpfs" opts)
+ (format #f " {~a, ~s, ~s, ~s, ~s}"
+ (if (string-prefix? "/root" file) 0 1)
+ "tmpfs" (string-append #$output (clean file))
+ "tmpfs" opts))
+
+ (("overlay" file "overlay" (lower upper work))
+ (let ((opts (overlay-opts file lower upper work)))
+ (format #f " {0, ~s, ~s, ~s, ~s}"
+ "overlay"
+ (string-append #$output "/root" (clean file))
+ "overlay" opts)))
+
+ ((spec file type opts)
+ (format #f " {0, ~s, ~s, ~s, ~s}"
+ (clean spec)
+ (string-append #$output "/root" (clean file))
+ type opts)))
+ '#$(map mount-spec mounts)
+ '#$(map mount-file mounts)
+ '#$(map mount-type mounts)
+ '#$(map mount-opts mounts)) ",\n" 'infix)))
+
+ (define (build-exec dst)
+ "Compile the `exec` binary which constructs a namespace"
+ (define cc (string-append #+gcc-toolchain "/bin/gcc"))
+ (define pipe
+ (open-pipe* OPEN_WRITE
+ cc "-mmusl" "-static"
+ "-include" "/dev/stdin"
+ (string-append "-B" #$musl "/lib")
+ (string-append "-I" #$musl "/include")
+ (string-append "-I" #$linux-libre-headers "/include")
+ (string-append "-L" #$musl "/lib")
+ (string-append "-B" #$gcc-toolchain "/bin")
+ "-o" dst
+ #$(local-file "./ns-helper.c")))
+
+ (write-exec-c-header pipe)
+ (match (status:exit-val (close-pipe pipe))
+ (0 #t)
+ (#f (error "cc exec failed"))
+ (rc (error "cc ext status ~a" rc))))
+
+ (define (trailing-slash? path)
+ (string-suffix? "/" path))
+
+ (define (create-empty file)
+ (with-output-to-file file noop))
+
+ (mkdir-p (string-append #$output "/root"))
+ (mkdir-p (string-append #$output "/root/gnu/store"))
+
+ (for-each
+ (lambda (dir) (mkdir-p (string-append #$output dir)))
+ '#$(map mount-file (filter tmpfs-mount? mounts)))
+
+ (for-each
+ (match-lambda
+ ((? trailing-slash? dir)
+ (mkdir-p (string-append #$output "/root" dir)))
+ (file
+ (mkdir-p (string-append #$output "/root" (dirname file)))
+ (create-empty (string-append #$output "/root" file))))
+ '#$(map mount-file (remove tmpfs-mount? mounts)))
+
+ (build-exec (string-append #$output "/exec")))))
+
+ (gexp->derivation "namespace" builder))
@@ 0,0 1,142 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define _GNU_SOURCE
+#include <sched.h> /* for unshare(2) */
+#include <sys/mount.h> /* for mount(2) */
+#include <sys/syscall.h> /* for pivot_root(2) */
+#include <sys/stat.h> /* for mkdirat(2) */
+#include <linux/limits.h> /* for PATH_MAX */
+
+#define NELEMS(x) ((sizeof x) / (sizeof *x))
+
+
+void die(const int rc, const char *fmterr, ...) {
+ va_list ap;
+ va_start(ap, fmterr);
+ vfprintf(stderr, fmterr, ap);
+ va_end(ap);
+ exit(rc);
+}
+
+void fail(const int rc, const char *fmterr, ...) {
+ int n;
+ va_list ap;
+ char prefix[500];
+ memset(prefix, 0, sizeof prefix);
+ va_start(ap, fmterr);
+ n = vsnprintf(prefix, (sizeof prefix) - 1, fmterr, ap);
+ va_end(ap);
+ perror(prefix);
+ exit(rc);
+}
+
+int pivot_root(const char *new_root, const char *put_old) {
+ return syscall(SYS_pivot_root, new_root, put_old);
+}
+
+void dumpfstab() {
+ printf("root = %s\n", root);
+ for (int i = 0; i < NELEMS(fstab); i++) {
+ struct fstab *fs = &fstab[i];
+ printf("%s\t%s\t\%s\t%s\n",
+ fs->spec, fs->file, fs->type, fs->opts);
+ }
+}
+
+int main(int argc, char **argv) {
+ const char *usage = "Usage: exec prog arg ...\n";
+
+ if (argc <= 1) die(2, usage);
+ if (!strcmp(argv[1] , "-h")) die(0, usage);
+ if (!strcmp(argv[1], "--help")) die(0, usage);
+ if (!strcmp(argv[1], "-D")) {
+ dumpfstab();
+ exit(0);
+ }
+
+ if (unshare(CLONE_NEWNS)) fail(1, "unshare(CLONE_NEWNS)");
+
+ if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
+ fail(1, "force mounts private: mount(/, MS_PRIVATE)");
+ }
+
+
+ for (int i = 0; i < NELEMS(fstab); i++) {
+ struct fstab *fs = &fstab[i];
+ int flags = 0;
+ if (!strcmp(fs->opts, "defaults,bind")) {
+ flags = MS_BIND | MS_REC;
+ }
+ if (mount(fs->spec, fs->file, fs->type, flags, fs->opts)) {
+ fail(1, "mount(%s, %s, %s, %x, %s)",
+ fs->spec, fs->file, fs->type, flags, fs->opts);
+ }
+
+ /* tmpfs mounts used to support a writable overlay must be
+ pre-populated with an "upper" and "work" dir */
+ int tmpfs;
+ if (fs->upper) {
+ if ((tmpfs = open(fs->file, O_PATH)) < 0)
+ fail(1, "open(%s, O_PATH)", fs->file);
+ if (mkdirat(tmpfs, "upper", 0777))
+ fail(1, "mkdirat(%s, \"upper\")", fs->file);
+ if (mkdirat(tmpfs, "work", 0777))
+ fail(1, "mkdirat(%s, \"work\")", fs->file);
+ if (close(tmpfs))
+ fail(1, "close(%s)", fs->file);
+ }
+ }
+
+ /* I don't like that this is necessary. However, most, if not all
+ binaries in the GNU store are dynamically linked against other
+ libraries in the GNU store.
+
+ Note that MS_REC is left out intentionally to avoid creating
+ a loop. */
+ if (mount("/gnu/store", gnustore, NULL, MS_BIND, "defaults,none"))
+ fail(1, "bind /gnu/store: mount(/gnu/store, %s)", gnustore);
+
+ /* This satisfies pivot_root(2)'s requirement that new_root must be
+ the path to a mount point. In practice, there is likely to be an
+ entry for the new root in fstab and this is not necessary. However,
+ it does not hurt, so we do it unconditionally. */
+ if (mount(root, root, NULL, MS_BIND | MS_REC, NULL)) {
+ fail(1, "ensure root is mountpoint: mount(%s, MS_BIND)", root);
+ }
+
+ /* This sequence is taken from the pivot_root(2) man page, and inspired
+ by the `lxc_pivot_root` function from LXC. */
+ int oldroot, newroot;
+ if ((oldroot = open("/", O_PATH)) < 0)
+ fail(1, "get fd for oldroot: open(/)");
+
+ if ((newroot = open(root, O_PATH)) < 0)
+ fail(1, "get fd for newroot: open(%s)", root);
+
+ if (fchdir(newroot))
+ fail(1, "change to new root: fchdir(%d)", newroot);
+
+ if (pivot_root(".", "."))
+ fail(1, "pivot_root(\".\", \".\")");
+
+ if (fchdir(oldroot))
+ fail(1, "escape to oldroot: fchdir(%d)", oldroot);
+
+ if (umount2(".", MNT_DETACH))
+ fail(1, "detach oldroot: umount2(\".\", MNT_DETACH)");
+
+ if (fchdir(newroot))
+ fail(1, "re-enter newroot: fchdir(%d)", newroot);
+
+ if (close(oldroot)) fail(1, "close oldroot");
+ if (close(newroot)) fail(1, "close newroot");
+
+ execv(argv[1], &argv[1]);
+ fail(1, "execv(%s, ...)", argv[1]);
+}