nixpkgs/nixos/modules/virtualisation/containers.nix

690 lines
22 KiB
Nix

{ config, lib, pkgs, ... }:
with lib;
let
# The container's init script, a small wrapper around the regular
# NixOS stage-2 init script.
containerInit = (cfg:
let
renderExtraVeth = (name: cfg:
''
echo "Bringing ${name} up"
ip link set dev ${name} up
${optionalString (cfg.localAddress != null) ''
echo "Setting ip for ${name}"
ip addr add ${cfg.localAddress} dev ${name}
''}
${optionalString (cfg.localAddress6 != null) ''
echo "Setting ip6 for ${name}"
ip -6 addr add ${cfg.localAddress6} dev ${name}
''}
${optionalString (cfg.hostAddress != null) ''
echo "Setting route to host for ${name}"
ip route add ${cfg.hostAddress} dev ${name}
''}
${optionalString (cfg.hostAddress6 != null) ''
echo "Setting route6 to host for ${name}"
ip -6 route add ${cfg.hostAddress6} dev ${name}
''}
''
);
in
pkgs.writeScript "container-init"
''
#! ${pkgs.stdenv.shell} -e
# Initialise the container side of the veth pair.
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link set host0 name eth0
ip link set dev eth0 up
if [ -n "$LOCAL_ADDRESS" ]; then
ip addr add $LOCAL_ADDRESS dev eth0
fi
if [ -n "$LOCAL_ADDRESS6" ]; then
ip -6 addr add $LOCAL_ADDRESS6 dev eth0
fi
if [ -n "$HOST_ADDRESS" ]; then
ip route add $HOST_ADDRESS dev eth0
ip route add default via $HOST_ADDRESS
fi
if [ -n "$HOST_ADDRESS6" ]; then
ip -6 route add $HOST_ADDRESS6 dev eth0
ip -6 route add default via $HOST_ADDRESS6
fi
${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg.extraVeths)}
fi
# Start the regular stage 1 script.
exec "$1"
''
);
nspawnExtraVethArgs = (name: cfg: "--network-veth-extra=${name}");
startScript = cfg:
''
mkdir -p -m 0755 "$root/etc" "$root/var/lib"
mkdir -p -m 0700 "$root/var/lib/private" "$root/root" /run/containers
if ! [ -e "$root/etc/os-release" ]; then
touch "$root/etc/os-release"
fi
if ! [ -e "$root/etc/machine-id" ]; then
touch "$root/etc/machine-id"
fi
mkdir -p -m 0755 \
"/nix/var/nix/profiles/per-container/$INSTANCE" \
"/nix/var/nix/gcroots/per-container/$INSTANCE"
cp --remove-destination /etc/resolv.conf "$root/etc/resolv.conf"
if [ "$PRIVATE_NETWORK" = 1 ]; then
extraFlags+=" --network-veth"
if [ -n "$HOST_BRIDGE" ]; then
extraFlags+=" --network-bridge=$HOST_BRIDGE"
fi
if [ -n "$HOST_PORT" ]; then
extraFlags+=" --port=$HOST_PORT"
fi
fi
extraFlags+=" ${concatStringsSep " " (mapAttrsToList nspawnExtraVethArgs cfg.extraVeths)}"
for iface in $INTERFACES; do
extraFlags+=" --network-interface=$iface"
done
for iface in $MACVLANS; do
extraFlags+=" --network-macvlan=$iface"
done
# If the host is 64-bit and the container is 32-bit, add a
# --personality flag.
${optionalString (config.nixpkgs.system == "x86_64-linux") ''
if [ "$(< ''${SYSTEM_PATH:-/nix/var/nix/profiles/per-container/$INSTANCE/system}/system)" = i686-linux ]; then
extraFlags+=" --personality=x86"
fi
''}
# Run systemd-nspawn without startup notification (we'll
# wait for the container systemd to signal readiness).
EXIT_ON_REBOOT=1 \
exec ${config.systemd.package}/bin/systemd-nspawn \
--keep-unit \
-M "$INSTANCE" -D "$root" $extraFlags \
$EXTRA_NSPAWN_FLAGS \
--notify-ready=yes \
--bind-ro=/nix/store \
--bind-ro=/nix/var/nix/db \
--bind-ro=/nix/var/nix/daemon-socket \
--bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
--bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
--setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
--setenv HOST_BRIDGE="$HOST_BRIDGE" \
--setenv HOST_ADDRESS="$HOST_ADDRESS" \
--setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
--setenv HOST_ADDRESS6="$HOST_ADDRESS6" \
--setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \
--setenv PATH="$PATH" \
${if cfg.additionalCapabilities != null && cfg.additionalCapabilities != [] then
''--capability="${concatStringsSep " " cfg.additionalCapabilities}"'' else ""
} \
${if cfg.tmpfs != null && cfg.tmpfs != [] then
''--tmpfs=${concatStringsSep " --tmpfs=" cfg.tmpfs}'' else ""
} \
${containerInit cfg} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
'';
preStartScript = cfg:
''
# Clean up existing machined registration and interfaces.
machinectl terminate "$INSTANCE" 2> /dev/null || true
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link del dev "ve-$INSTANCE" 2> /dev/null || true
ip link del dev "vb-$INSTANCE" 2> /dev/null || true
fi
${concatStringsSep "\n" (
mapAttrsToList (name: cfg:
''ip link del dev ${name} 2> /dev/null || true ''
) cfg.extraVeths
)}
'';
postStartScript = (cfg:
let
ipcall = cfg: ipcmd: variable: attribute:
if cfg.${attribute} == null then
''
if [ -n "${variable}" ]; then
${ipcmd} add ${variable} dev $ifaceHost
fi
''
else
''${ipcmd} add ${cfg.${attribute}} dev $ifaceHost'';
renderExtraVeth = name: cfg:
if cfg.hostBridge != null then
''
# Add ${name} to bridge ${cfg.hostBridge}
ip link set dev ${name} master ${cfg.hostBridge} up
''
else
''
# Set IPs and routes for ${name}
${optionalString (cfg.hostAddress != null) ''
ip addr add ${cfg.hostAddress} dev ${name}
''}
${optionalString (cfg.hostAddress6 != null) ''
ip -6 addr add ${cfg.hostAddress6} dev ${name}
''}
${optionalString (cfg.localAddress != null) ''
ip route add ${cfg.localAddress} dev ${name}
''}
${optionalString (cfg.localAddress6 != null) ''
ip -6 route add ${cfg.localAddress6} dev ${name}
''}
'';
in
''
if [ "$PRIVATE_NETWORK" = 1 ]; then
if [ -z "$HOST_BRIDGE" ]; then
ifaceHost=ve-$INSTANCE
ip link set dev $ifaceHost up
${ipcall cfg "ip addr" "$HOST_ADDRESS" "hostAddress"}
${ipcall cfg "ip -6 addr" "$HOST_ADDRESS6" "hostAddress6"}
${ipcall cfg "ip route" "$LOCAL_ADDRESS" "localAddress"}
${ipcall cfg "ip -6 route" "$LOCAL_ADDRESS6" "localAddress6"}
fi
${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg.extraVeths)}
fi
# Get the leader PID so that we can signal it in
# preStop. We can't use machinectl there because D-Bus
# might be shutting down. FIXME: in systemd 219 we can
# just signal systemd-nspawn to do a clean shutdown.
machinectl show "$INSTANCE" | sed 's/Leader=\(.*\)/\1/;t;d' > "/run/containers/$INSTANCE.pid"
''
);
serviceDirectives = cfg: {
ExecReload = pkgs.writeScript "reload-container"
''
#! ${pkgs.stdenv.shell} -e
${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \
bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test"
'';
SyslogIdentifier = "container %i";
EnvironmentFile = "-/etc/containers/%i.conf";
Type = "notify";
# Note that on reboot, systemd-nspawn returns 133, so this
# unit will be restarted. On poweroff, it returns 0, so the
# unit won't be restarted.
RestartForceExitStatus = "133";
SuccessExitStatus = "133";
Restart = "on-failure";
# Hack: we don't want to kill systemd-nspawn, since we call
# "machinectl poweroff" in preStop to shut down the
# container cleanly. But systemd requires sending a signal
# (at least if we want remaining processes to be killed
# after the timeout). So send an ignored signal.
KillMode = "mixed";
KillSignal = "WINCH";
DevicePolicy = "closed";
DeviceAllow = map (d: "${d.node} ${d.modifier}") cfg.allowedDevices;
};
system = config.nixpkgs.system;
bindMountOpts = { name, config, ... }: {
options = {
mountPoint = mkOption {
example = "/mnt/usb";
type = types.str;
description = "Mount point on the container file system.";
};
hostPath = mkOption {
default = null;
example = "/home/alice";
type = types.nullOr types.str;
description = "Location of the host path to be mounted.";
};
isReadOnly = mkOption {
default = true;
example = true;
type = types.bool;
description = "Determine whether the mounted path will be accessed in read-only mode.";
};
};
config = {
mountPoint = mkDefault name;
};
};
allowedDeviceOpts = { name, config, ... }: {
options = {
node = mkOption {
example = "/dev/net/tun";
type = types.str;
description = "Path to device node";
};
modifier = mkOption {
example = "rw";
type = types.str;
description = ''
Device node access modifier. Takes a combination
<literal>r</literal> (read), <literal>w</literal> (write), and
<literal>m</literal> (mknod). See the
<literal>systemd.resource-control(5)</literal> man page for more
information.'';
};
};
};
mkBindFlag = d:
let flagPrefix = if d.isReadOnly then " --bind-ro=" else " --bind=";
mountstr = if d.hostPath != null then "${d.hostPath}:${d.mountPoint}" else "${d.mountPoint}";
in flagPrefix + mountstr ;
mkBindFlags = bs: concatMapStrings mkBindFlag (lib.attrValues bs);
networkOptions = {
hostBridge = mkOption {
type = types.nullOr types.string;
default = null;
example = "br0";
description = ''
Put the host-side of the veth-pair into the named bridge.
Only one of hostAddress* or hostBridge can be given.
'';
};
hostAddress = mkOption {
type = types.nullOr types.str;
default = null;
example = "10.231.136.1";
description = ''
The IPv4 address assigned to the host interface.
(Not used when hostBridge is set.)
'';
};
hostAddress6 = mkOption {
type = types.nullOr types.string;
default = null;
example = "fc00::1";
description = ''
The IPv6 address assigned to the host interface.
(Not used when hostBridge is set.)
'';
};
localAddress = mkOption {
type = types.nullOr types.str;
default = null;
example = "10.231.136.2";
description = ''
The IPv4 address assigned to the interface in the container.
If a hostBridge is used, this should be given with netmask to access
the whole network. Otherwise the default netmask is /32 and routing is
set up from localAddress to hostAddress and back.
'';
};
localAddress6 = mkOption {
type = types.nullOr types.string;
default = null;
example = "fc00::2";
description = ''
The IPv6 address assigned to the interface in the container.
If a hostBridge is used, this should be given with netmask to access
the whole network. Otherwise the default netmask is /128 and routing is
set up from localAddress6 to hostAddress6 and back.
'';
};
};
dummyConfig =
{
extraVeths = {};
additionalCapabilities = [];
allowedDevices = [];
hostAddress = null;
hostAddress6 = null;
localAddress = null;
localAddress6 = null;
tmpfs = null;
};
in
{
options = {
boot.isContainer = mkOption {
type = types.bool;
default = false;
description = ''
Whether this NixOS machine is a lightweight container running
in another NixOS system.
'';
};
boot.enableContainers = mkOption {
type = types.bool;
default = !config.boot.isContainer;
description = ''
Whether to enable support for nixos containers.
'';
};
containers = mkOption {
type = types.attrsOf (types.submodule (
{ config, options, name, ... }:
{
options = {
config = mkOption {
description = ''
A specification of the desired configuration of this
container, as a NixOS module.
'';
type = lib.mkOptionType {
name = "Toplevel NixOS config";
merge = loc: defs: (import ../../lib/eval-config.nix {
inherit system;
modules =
let extraConfig =
{ boot.isContainer = true;
networking.hostName = mkDefault name;
networking.useDHCP = false;
};
in [ extraConfig ] ++ (map (x: x.value) defs);
prefix = [ "containers" name ];
}).config;
};
};
path = mkOption {
type = types.path;
example = "/nix/var/nix/profiles/containers/webserver";
description = ''
As an alternative to specifying
<option>config</option>, you can specify the path to
the evaluated NixOS system configuration, typically a
symlink to a system profile.
'';
};
additionalCapabilities = mkOption {
type = types.listOf types.str;
default = [];
example = [ "CAP_NET_ADMIN" "CAP_MKNOD" ];
description = ''
Grant additional capabilities to the container. See the
capabilities(7) and systemd-nspawn(1) man pages for more
information.
'';
};
enableTun = mkOption {
type = types.bool;
default = false;
description = ''
Allows the container to create and setup tunnel interfaces
by granting the <literal>NET_ADMIN</literal> capability and
enabling access to <literal>/dev/net/tun</literal>.
'';
};
privateNetwork = mkOption {
type = types.bool;
default = false;
description = ''
Whether to give the container its own private virtual
Ethernet interface. The interface is called
<literal>eth0</literal>, and is hooked up to the interface
<literal>ve-<replaceable>container-name</replaceable></literal>
on the host. If this option is not set, then the
container shares the network interfaces of the host,
and can bind to any port on any interface.
'';
};
interfaces = mkOption {
type = types.listOf types.string;
default = [];
example = [ "eth1" "eth2" ];
description = ''
The list of interfaces to be moved into the container.
'';
};
macvlans = mkOption {
type = types.listOf types.str;
default = [];
example = [ "eth1" "eth2" ];
description = ''
The list of host interfaces from which macvlans will be
created. For each interface specified, a macvlan interface
will be created and moved to the container.
'';
};
extraVeths = mkOption {
type = with types; attrsOf (submodule { options = networkOptions; });
default = {};
description = ''
Extra veth-pairs to be created for the container
'';
};
autoStart = mkOption {
type = types.bool;
default = false;
description = ''
Wether the container is automatically started at boot-time.
'';
};
bindMounts = mkOption {
type = with types; loaOf (submodule bindMountOpts);
default = {};
example = { "/home" = { hostPath = "/home/alice";
isReadOnly = false; };
};
description =
''
An extra list of directories that is bound to the container.
'';
};
allowedDevices = mkOption {
type = with types; listOf (submodule allowedDeviceOpts);
default = [];
example = [ { node = "/dev/net/tun"; modifier = "rw"; } ];
description = ''
A list of device nodes to which the containers has access to.
'';
};
tmpfs = mkOption {
type = types.listOf types.str;
default = [];
example = [ "/var" ];
description = ''
Mounts a set of tmpfs file systems into the container.
Multiple paths can be specified.
Valid items must conform to the --tmpfs argument
of systemd-nspawn. See systemd-nspawn(1) for details.
'';
};
} // networkOptions;
config = mkMerge
[
(mkIf options.config.isDefined {
path = config.config.system.build.toplevel;
})
];
}));
default = {};
example = literalExample
''
{ webserver =
{ path = "/nix/var/nix/profiles/webserver";
};
database =
{ config =
{ config, pkgs, ... }:
{ services.postgresql.enable = true;
services.postgresql.package = pkgs.postgresql92;
};
};
}
'';
description = ''
A set of NixOS system configurations to be run as lightweight
containers. Each container appears as a service
<literal>container-<replaceable>name</replaceable></literal>
on the host system, allowing it to be started and stopped via
<command>systemctl</command>.
'';
};
};
config = mkIf (config.boot.enableContainers) (let
unit = {
description = "Container '%i'";
unitConfig.RequiresMountsFor = [ "/var/lib/containers/%i" ];
path = [ pkgs.iproute ];
environment.INSTANCE = "%i";
environment.root = "/var/lib/containers/%i";
preStart = preStartScript dummyConfig;
script = startScript dummyConfig;
postStart = postStartScript dummyConfig;
preStop =
''
pid="$(cat /run/containers/$INSTANCE.pid)"
if [ -n "$pid" ]; then
kill -RTMIN+4 "$pid"
fi
rm -f "/run/containers/$INSTANCE.pid"
'';
restartIfChanged = false;
serviceConfig = serviceDirectives dummyConfig;
};
in {
systemd.services = listToAttrs (filter (x: x.value != null) (
# The generic container template used by imperative containers
[{ name = "container@"; value = unit; }]
# declarative containers
++ (mapAttrsToList (name: cfg: nameValuePair "container@${name}" (let
config = cfg // (
if cfg.enableTun then
{
allowedDevices = cfg.allowedDevices
++ [ { node = "/dev/net/tun"; modifier = "rw"; } ];
additionalCapabilities = cfg.additionalCapabilities
++ [ "CAP_NET_ADMIN" ];
}
else {});
in
unit // {
preStart = preStartScript config;
script = startScript config;
postStart = postStartScript config;
serviceConfig = serviceDirectives config;
} // (
if config.autoStart then
{
wantedBy = [ "multi-user.target" ];
wants = [ "network.target" ];
after = [ "network.target" ];
restartTriggers = [ config.path ];
reloadIfChanged = true;
}
else {})
)) config.containers)
));
# Generate a configuration file in /etc/containers for each
# container so that container@.target can get the container
# configuration.
environment.etc = mapAttrs' (name: cfg: nameValuePair "containers/${name}.conf"
{ text =
''
SYSTEM_PATH=${cfg.path}
${optionalString cfg.privateNetwork ''
PRIVATE_NETWORK=1
${optionalString (cfg.hostBridge != null) ''
HOST_BRIDGE=${cfg.hostBridge}
''}
${optionalString (cfg.hostAddress != null) ''
HOST_ADDRESS=${cfg.hostAddress}
''}
${optionalString (cfg.hostAddress6 != null) ''
HOST_ADDRESS6=${cfg.hostAddress6}
''}
${optionalString (cfg.localAddress != null) ''
LOCAL_ADDRESS=${cfg.localAddress}
''}
${optionalString (cfg.localAddress6 != null) ''
LOCAL_ADDRESS6=${cfg.localAddress6}
''}
''}
INTERFACES="${toString cfg.interfaces}"
MACVLANS="${toString cfg.macvlans}"
${optionalString cfg.autoStart ''
AUTO_START=1
''}
EXTRA_NSPAWN_FLAGS="${mkBindFlags cfg.bindMounts}"
'';
}) config.containers;
# Generate /etc/hosts entries for the containers.
networking.extraHosts = concatStrings (mapAttrsToList (name: cfg: optionalString (cfg.localAddress != null)
''
${head (splitString "/" cfg.localAddress)} ${name}.containers
'') config.containers);
networking.dhcpcd.denyInterfaces = [ "ve-*" "vb-*" ];
environment.systemPackages = [ pkgs.nixos-container ];
});
}