~droyo/guix-channel

3bfc628072a3ea8dda59cde303edd7869e30c3d9 — David Arroyo 2 years ago 3661e11
Add module and syntax for declaring mount namespaces

This commit adds the (aqwari namespace) module, which provides the
following syntax:

	(namespace
	  (bind "/" (list nss-certs tzdata util-linux iproute))
	  (bind "/etc/hosts" #~(plain-file "hosts" "127.0.0.1 localhost\n"))
	  (bind "/var")
	  (bind "/etc/resolv.conf"))

It allows you to compose a new mount namespace as a series of bindings
to directories in the root namespace, or resources in the GNU store,
which will get built on-demand. The same directory can be bound to
multiple resources, which will result in the creation of a union mount
using overlayfs.

The namespace is a run-time concept, so this expression actually builds a
directory tree and an `exec` binary which creates an anonymous instance
of the described namespace, occupied by a single process; itself. It
can then execv(2) into a program provided on its commandline.

The `exec` binary is statically compiled and has no dependencies other
than the linux kernel. It can be run multiple times, and will create
a new namespace each time. Once the process exits, the namespace will
be destroyed.

My initial intent is to use this to create a root file system from which
I can run an s6 supervision tree that will run various network servers
on my PC and on my remote servers. I can build it with Guix and ship a
`guix pack` tarball to machines that don't have Guix.

A secondary goal is to explore mount namespaces as an alternative to
guix profiles. There are some very real usability obstacles to that,
first and foremost the problem that the creation of mount namespaces
requires elevated privileges on most Linux distributions.
4 files changed, 486 insertions(+), 0 deletions(-)

M README
A aqwari/namespace.scm
A aqwari/ns-helper.c
A tests/namespace.scm
M README => README +3 -0
@@ 11,3 11,6 @@ To add this channel to your guix, put the following in
           "A5D3 7797 27EB 5C3E 107E  7938 47DF EE31 B4F7 7BD2"))))
       %default-channels)

This Guix channel contains, or will contain, various packages
and extensions to guix that I use for managing my personal
servers.

A aqwari/namespace.scm => aqwari/namespace.scm +328 -0
@@ 0,0 1,328 @@
;; Declare and combine linux mount_namespaces(7).
;; Builds a self-contained executable that constructs an
;; anonymous namespace according to the macro's input.
(define-module (aqwari namespace)
  #:use-module (ice-9 match)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-9)

  #:use-module (guix gexp)
  #:use-module (guix packages)

  #:use-module (gnu packages base)
  #:use-module (gnu packages certs)
  #:use-module (gnu packages musl)
  #:use-module (gnu packages linux)
  #:use-module (gnu packages commencement)

  #:export
    (namespace
     namespace?
     tmpfs))

(define-record-type <namespace>
  (make-namespace mounts)
  namespace?
  (mounts namespace-mounts))

;; A bind mount (e.g. mount --bind)
(define-record-type <bind-mount>
  (make-bind-mount target source)
  bind-mount?
  (target bind-mount-target)
  (source bind-mount-source))

;; An overlayfs union mount
(define-record-type <overlay-mount>
  (make-overlay-mount target lowerdir upperdir workdir)
  overlay-mount?
  (target   overlay-mount-target)   ;; dir to mount on
  (lowerdir overlay-mount-lowerdir) ;; ordered list of read-only dirs
  (upperdir overlay-mount-upperdir) ;; single writable directory
  (workdir  overlay-mount-workdir)) ;; writable merge dir, same fs as upperdir

;; A tmpfs mount
(define-record-type <tmpfs-mount>
  (make-tmpfs-mount target option)
  tmpfs-mount?
  (target tmpfs-mount-target)  ;; dir to mount on
  (option tmpfs-mount-option)) ;; "size=10M", etc

(define (tmpfs options)
  "Create a template for a tmpfs mount. Re-using the result in multiple binds
creates multiple distinct tmpfs file systems. OPTIONS is a comma-separated
string of key=value parameters for tmpfs(5)"
  (make-tmpfs-mount #nil options))

(define mount-file
  (match-lambda
    (($ <bind-mount>    target) target)
    (($ <overlay-mount> target) target)
    (($ <tmpfs-mount>   target) target)))

(define mount-spec
  (match-lambda
    (($ <bind-mount> _ source) source)
    (($ <overlay-mount>) "overlay")
    (($ <tmpfs-mount>) "tmpfs")))

(define mount-type
  (match-lambda
    (($ <bind-mount>) "none")
    (($ <overlay-mount>) "overlay")
    (($ <tmpfs-mount>) "tmpfs")))

(define mount-opts
  (match-lambda
    (($ <bind-mount>) "defaults,bind")
    (($ <tmpfs-mount> _ opts) opts)

    ;; upper, work, and any element of lower may be a G-expression, whose
    ;; location is not known until build-time. So we don't build the option
    ;; string yet and ship the parameters to the build-side code instead.
    (($ <overlay-mount> _ lower upper work)
     (list lower upper work))))

(define bind->mount
  ;; Parses the inputs of the (bind ...) macro in (namespace ...)
  ;; and produces the appropriate mount (tmpfs, overlay, bind, etc)
  (match-lambda*
    (((? string? target) (? string? source))
     (make-bind-mount target source))

    (((? string? target) (? gexp? source))
     (make-bind-mount target source))

    ;; This is for convenience, so users don't have to wrap each package
    ;; in a gexp. We also mount any propagated inputs, so they are present
    ;; in the resulting namespace.
    (((? string? target) (? package? pkg))
     (apply values
       (cons
         (make-bind-mount target (gexp (ungexp pkg)))
         (map
           (lambda (dep) (bind->mount target dep))
           (package-propagated-inputs pkg)))))

    ;; Matching the various formats for describing inputs
    (((? string? target) ((? string? pkgname) (? package? pkg)))
     (bind->mount target pkg))

    (((? string? target) ((? string? pkgname) (? package? pkg) (? string? output)))
     (make-bind-mount target (gexp (ungexp pkg output))))

    (((? string? target) ($ <tmpfs-mount> _ options))
     (make-tmpfs-mount target options))

    (((? string? target))
     (make-bind-mount target target))

    (((? list? targets))
     (apply values (map bind->mount targets)))

    (((? string? target) (? list? sources))
     (apply values
       (map (lambda (source) (bind->mount target source)) sources)))))

(define-syntax namespace-args
  (syntax-rules (bind include)
    ((namespace-args ()) '())

    ((namespace-args ((include ns) . rest))
     (append
       (namespace-mounts ns)
       (namespace-args rest)))

    ((namespace-args ((bind args ...) . rest))
     (append
       (call-with-values (lambda () (bind->mount args ...)) list)
       (namespace-args rest)))))

(define-syntax-rule (namespace . args)
  (make-namespace (namespace-args args)))

(define-public %namespace-minimal
  (namespace
    (bind '("/dev/" "/proc/"))
    (bind "/" (list tzdata nss-certs))))

(define (compare-mounts a b)
  ;; Push tmpfs for a directory to the head of a sequence of mounts for
  ;; the same mountpoint, because it must be mounted first if it is to
  ;; be used for an overlay mount.
  (if (tmpfs-mount? a)
      (string<= (mount-file a) (mount-file b))
      (string<  (mount-file a) (mount-file b))))

(define (collapse-mounts mounts)
  "Create union mounts for multiple binds to the same location, if possible."
  ;; The order of bindings to the same mountpoint is significant,
  ;; so stable sort is a requirement.
  (let loop ((args (stable-sort mounts compare-mounts)))
    (match args
      ('() '())

      ;; overlayfs is kind of a PITA. For writable unions, it requires
      ;; a writable dir to use as the "upperdir", where writes will go,
      ;; *AND* an empty "workdir" for temporary storage, on the same
      ;; file system as upperdir. The semantics we're supporting here is
      ;; that if a tmpfs is part of a union mount, it is used as the
      ;; upperdir and the union is made writable. To avoid polluting the
      ;; new namespace, it is mounted *outside* the namespace root, and
      ;; the `exec` helper populates it with a "work" and "upper" dir.
      ;;
      ;; Compare to the simple semantics of Plan 9's bind(1); additions to
      ;; a union directory go to the first member of the union. That's it.
      ((($ <tmpfs-mount>   mnt opts)
        ($ <bind-mount>    mnt source) . rest)
       (let* ((tmnt (string-append "/tmpfs" mnt))
              (upper (string-append (string-trim-right tmnt #\/) "/upper"))
              (work (string-append (string-trim-right tmnt #\/)  "/work"))
              (lower (list source)))
         (cons
           (make-tmpfs-mount tmnt opts)
           (loop (cons (make-overlay-mount mnt lower upper work) rest)))))

      ;; A side effect of the above is that whether or not a tmpfs is
      ;; mounted in the new namespace is determined by whether or not
      ;; it is a member of a union mount. So the build-side code must
      ;; be told explicitly whether to mount it under "root" or "tmpfs"
      ((($ <tmpfs-mount> mnt opts) . rest)
       (cons
         (make-tmpfs-mount (string-append "/root" mnt) opts)
         (loop rest)))

      ((($ <overlay-mount> mnt lower upper work)
        ($ <bind-mount>    mnt source) . rest)
       (let ((stack (append lower (list source))))
         (loop
           (cons
             (make-overlay-mount mnt stack upper work)
             rest))))

      ((($ <bind-mount> mnt dir1)
        ($ <bind-mount> mnt dir2) . rest)
       (loop
         (cons
           (make-overlay-mount mnt (list dir1 dir2) #f #f)
           rest)))

      (((= mount-file mnt) (= mount-file mnt) . rest)
       (error "don't know how to combine ~a" (take args 2)))

      ((fs . rest) (cons fs (loop rest))))))

(define-gexp-compiler
  (namespace-compiler (ns <namespace>) system target)

  (define mounts (collapse-mounts (namespace-mounts ns)))

  (define builder
    (with-imported-modules '((guix build utils))
    #~(begin
        (use-modules
          (ice-9 popen)
          (ice-9 match)
          (srfi srfi-1)
          (srfi srfi-13)
          (guix build utils))

        (define (clean path) (string-trim-right path #\/))

        (define (overlay-opts file lowerdir . rest)
          (let ((lowerdir (append lowerdir (list (string-append #$output "/root" file)))))
            (string-join
              (map
                string-append
                (list "lowerdir=" "upperdir=" "workdir=")
                (cons
                  (string-join lowerdir ":")
                  (map
                    (lambda (dir) (string-append #$output dir))
                    (remove not rest)))) ",")))

        (define (write-exec-c-header port)
          "Write C header describing the mount namespace to PORT"
          (format port "struct fstab { ~a};~%"
            (string-join
              (cons "int upper"
                (map
                  (lambda (field) (format #f "char *~a" field))
                  '("spec" "file" "type" "opts"))) "; " 'suffix))
          (format port "const char *root = ~s;~%"
            (string-append #$output "/root"))
          (format port "const char *gnustore = ~s;~%"
            (string-append #$output "/root/gnu/store"))
          (format port "struct fstab fstab[] = {~%~a~%};~%"
            (string-join
              (map
                (match-lambda*
                  (("tmpfs" file "tmpfs" opts)
                   (format #f "	{~a, ~s, ~s, ~s, ~s}"
                     (if (string-prefix? "/root" file) 0 1)
                     "tmpfs" (string-append #$output (clean file))
                     "tmpfs" opts))

                  (("overlay" file "overlay" (lower upper work))
                   (let ((opts (overlay-opts file lower upper work)))
                     (format #f "	{0, ~s, ~s, ~s, ~s}"
                       "overlay"
                       (string-append #$output "/root" (clean file))
                       "overlay" opts)))

                  ((spec file type opts)
                   (format #f "	{0, ~s, ~s, ~s, ~s}"
                      (clean spec)
                      (string-append #$output "/root" (clean file))
                      type opts)))
                '#$(map mount-spec mounts)
                '#$(map mount-file mounts)
                '#$(map mount-type mounts)
                '#$(map mount-opts mounts)) ",\n" 'infix)))

        (define (build-exec dst)
          "Compile the `exec` binary which constructs a namespace"
          (define cc (string-append #+gcc-toolchain "/bin/gcc"))
          (define pipe
            (open-pipe* OPEN_WRITE
              cc "-mmusl" "-static"
                 "-include" "/dev/stdin"
                 (string-append "-B" #$musl "/lib")
                 (string-append "-I" #$musl "/include")
                 (string-append "-I" #$linux-libre-headers "/include")
                 (string-append "-L" #$musl "/lib")
                 (string-append "-B" #$gcc-toolchain "/bin")
                 "-o" dst
                 #$(local-file "./ns-helper.c")))

          (write-exec-c-header pipe)
          (match (status:exit-val (close-pipe pipe))
            (0 #t)
            (#f (error "cc exec failed"))
            (rc (error "cc ext status ~a" rc))))

        (define (trailing-slash? path)
          (string-suffix? "/" path))

        (define (create-empty file)
          (with-output-to-file file noop))

        (mkdir-p (string-append #$output "/root"))
        (mkdir-p (string-append #$output "/root/gnu/store"))

        (for-each
          (lambda (dir) (mkdir-p (string-append #$output dir)))
          '#$(map mount-file (filter tmpfs-mount? mounts)))

        (for-each
          (match-lambda
            ((? trailing-slash? dir)
             (mkdir-p (string-append #$output "/root" dir)))
            (file
             (mkdir-p (string-append #$output "/root" (dirname file)))
             (create-empty (string-append #$output "/root" file))))
          '#$(map mount-file (remove tmpfs-mount? mounts)))

        (build-exec (string-append #$output "/exec")))))

  (gexp->derivation "namespace" builder))

A aqwari/ns-helper.c => aqwari/ns-helper.c +142 -0
@@ 0,0 1,142 @@
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define _GNU_SOURCE
#include <sched.h>        /* for unshare(2) */
#include <sys/mount.h>    /* for mount(2) */
#include <sys/syscall.h>  /* for pivot_root(2) */
#include <sys/stat.h>     /* for mkdirat(2) */
#include <linux/limits.h> /* for PATH_MAX */

#define NELEMS(x) ((sizeof x) / (sizeof *x))


void die(const int rc, const char *fmterr, ...) {
	va_list ap;
	va_start(ap, fmterr);
	vfprintf(stderr, fmterr, ap);
	va_end(ap);
	exit(rc);
}

void fail(const int rc, const char *fmterr, ...) {
	int n;
	va_list ap;
	char prefix[500];
	memset(prefix, 0, sizeof prefix);
	va_start(ap, fmterr);
	n = vsnprintf(prefix, (sizeof prefix) - 1, fmterr, ap);
	va_end(ap);
	perror(prefix);
	exit(rc);
}

int pivot_root(const char *new_root, const char *put_old) {
	return syscall(SYS_pivot_root, new_root, put_old);
}

void dumpfstab() {
	printf("root = %s\n", root);
	for (int i = 0; i < NELEMS(fstab); i++) {
		struct fstab *fs = &fstab[i];
		printf("%s\t%s\t\%s\t%s\n",
			fs->spec, fs->file, fs->type, fs->opts);
	}
}

int main(int argc, char **argv) {
	const char *usage = "Usage: exec prog arg ...\n";

	if (argc <= 1) die(2, usage);
	if (!strcmp(argv[1] , "-h")) die(0, usage);
	if (!strcmp(argv[1], "--help")) die(0, usage);
	if (!strcmp(argv[1], "-D")) {
		dumpfstab();
		exit(0);
	}

	if (unshare(CLONE_NEWNS)) fail(1, "unshare(CLONE_NEWNS)");

	if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
		fail(1, "force mounts private: mount(/, MS_PRIVATE)");
	}


	for (int i = 0; i < NELEMS(fstab); i++) {
		struct fstab *fs = &fstab[i];
		int flags = 0;
		if (!strcmp(fs->opts, "defaults,bind")) {
			flags = MS_BIND | MS_REC;
		}
		if (mount(fs->spec, fs->file, fs->type, flags, fs->opts)) {
			fail(1, "mount(%s, %s, %s, %x, %s)",
				fs->spec, fs->file, fs->type, flags, fs->opts);
		}

		/* tmpfs mounts used to support a writable overlay must be
		   pre-populated with an "upper" and "work" dir */
		int tmpfs;
		if (fs->upper) {
			if ((tmpfs = open(fs->file, O_PATH)) < 0)
				fail(1, "open(%s, O_PATH)", fs->file);
			if (mkdirat(tmpfs, "upper", 0777))
				fail(1, "mkdirat(%s, \"upper\")", fs->file);
			if (mkdirat(tmpfs, "work", 0777))
				fail(1, "mkdirat(%s, \"work\")", fs->file);
			if (close(tmpfs))
				fail(1, "close(%s)", fs->file);
		}
	}

	/* I don't like that this is necessary. However, most, if not all
	   binaries in the GNU store are dynamically linked against other
	   libraries in the GNU store.

	   Note that MS_REC is left out intentionally to avoid creating
	   a loop. */
	if (mount("/gnu/store", gnustore, NULL, MS_BIND, "defaults,none"))
		fail(1, "bind /gnu/store: mount(/gnu/store, %s)", gnustore);

	/* This satisfies pivot_root(2)'s requirement that new_root must be
	   the path to a mount point. In practice, there is likely to be an
	   entry for the new root in fstab and this is not necessary. However,
	   it does not hurt, so we do it unconditionally. */
	if (mount(root, root, NULL, MS_BIND | MS_REC, NULL)) {
		fail(1, "ensure root is mountpoint: mount(%s, MS_BIND)", root);
	}

	/* This sequence is taken from the pivot_root(2) man page, and inspired
	   by the `lxc_pivot_root` function from LXC. */
	int oldroot, newroot;
	if ((oldroot = open("/", O_PATH)) < 0)
		fail(1, "get fd for oldroot: open(/)");

	if ((newroot = open(root, O_PATH)) < 0)
		fail(1, "get fd for newroot: open(%s)", root);

	if (fchdir(newroot))
		fail(1, "change to new root: fchdir(%d)", newroot);

	if (pivot_root(".", "."))
		fail(1, "pivot_root(\".\", \".\")");

	if (fchdir(oldroot))
		fail(1, "escape to oldroot: fchdir(%d)", oldroot);

	if (umount2(".", MNT_DETACH))
		fail(1, "detach oldroot: umount2(\".\", MNT_DETACH)");

	if (fchdir(newroot))
		fail(1, "re-enter newroot: fchdir(%d)", newroot);

	if (close(oldroot)) fail(1, "close oldroot");
	if (close(newroot)) fail(1, "close newroot");

	execv(argv[1], &argv[1]);
	fail(1, "execv(%s, ...)", argv[1]);
}

A tests/namespace.scm => tests/namespace.scm +13 -0
@@ 0,0 1,13 @@
(define-module (test-namespace)
  #:use-module (srfi srfi-64)
  #:use-module (aqwari namespace))

(test-begin "namespace")

(define simple-ns
  (namespace
    (include %namespace-minimal)
    (bind "/var")))

(test-assert simple-ns)
(test-assert (namespace? simple-ns))