aboutsummaryrefslogtreecommitdiff
blob: 6a823c6cff91053cfcc2c1f6918945520486f674 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.

"""Support for Linux namespaces"""

import ctypes
import ctypes.util
import errno
import os
import signal
import socket
import subprocess
import sys

from ..osutils.mount import (
    MS_NODEV,
    MS_NOEXEC,
    MS_NOSUID,
    MS_PRIVATE,
    MS_REC,
    MS_RELATIME,
    MS_SLAVE,
)
from ..osutils.mount import mount as _mount
from . import exit_as_status

# CLONE_* flag values handed to the unshare()/setns() bindings in this module.
# NOTE(review): the numbers are expected to mirror the kernel's own definitions
# (see clone(2) / <linux/sched.h>) -- confirm before changing any of them.
# CLONE_FS and CLONE_FILES are not referenced in the visible part of this module.
CLONE_FS = 0x00000200
CLONE_FILES = 0x00000400
CLONE_NEWNS = 0x00020000  # mount namespace
CLONE_NEWUTS = 0x04000000  # UTS (hostname) namespace
CLONE_NEWIPC = 0x08000000  # IPC namespace
CLONE_NEWUSER = 0x10000000  # user namespace
CLONE_NEWPID = 0x20000000  # pid namespace
CLONE_NEWNET = 0x40000000  # net namespace


def setns(fd, nstype):
    """Enter an existing namespace via the Linux setns system call (see setns(2)).

    :param fd: Either an already-open file descriptor, or a ``str`` path that
        is opened for the duration of the call and closed again afterwards.
    :param nstype: Namespace to enter; one of CLONE_*.
    :raises OSError: if the path cannot be opened or if setns failed.
    """

    def _enter(raw_fd):
        # use_errno=True lets ctypes capture the C errno for get_errno() below.
        libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
        rc = libc.setns(ctypes.c_int(raw_fd), ctypes.c_int(nstype))
        if rc != 0:
            err = ctypes.get_errno()
            raise OSError(err, os.strerror(err))

    # Caller supplied a descriptor: use it directly and leave it open.
    if not isinstance(fd, str):
        _enter(fd)
        return

    # Caller supplied a path: we own the file object, so always close it.
    with open(fd) as ns_file:
        _enter(ns_file.fileno())


def unshare(flags):
    """Call the Linux unshare system call through libc (see unshare(2)).

    :param flags: Namespaces to unshare; bitwise OR of CLONE_* flags.
    :raises OSError: if unshare failed; carries the errno reported by libc.
    """
    # use_errno=True lets ctypes capture the C errno for get_errno() below.
    libc_path = ctypes.util.find_library("c")
    libc = ctypes.CDLL(libc_path, use_errno=True)

    rc = libc.unshare(ctypes.c_int(flags))
    if rc == 0:
        return

    err = ctypes.get_errno()
    raise OSError(err, os.strerror(err))


def _reap_children(pid):
    """Reap all children that get reparented to us until we see |pid| exit.

    :param pid: The main child to watch for.
    :return: The wait status of the |pid| child.
    """
    pid_status = 0

    while True:
        try:
            (wpid, status) = os.wait()
            if pid == wpid:
                # Save the status of our main child so we can exit with it below.
                pid_status = status
        except OSError as e:
            if e.errno == errno.ECHILD:
                break
            elif e.errno != errno.EINTR:
                raise

    return pid_status


def _safe_tcsetpgrp(fd, pgrp):
    """Set |pgrp| as the controller of the tty |fd|."""
    try:
        curr_pgrp = os.tcgetpgrp(fd)
    except OSError as e:
        # This can come up when the fd is not connected to a terminal.
        if e.errno == errno.ENOTTY:
            return
        raise

    # We can change the owner only if currently own it.  Otherwise we'll get
    # stopped by the kernel with SIGTTOU and that'll hit the whole group.
    if curr_pgrp == os.getpgrp():
        os.tcsetpgrp(fd, pgrp)


def create_pidns():
    """Start a new pid namespace

    This will launch all the right manager processes.  The child that returns
    will be isolated in a new pid namespace.

    The function forks twice.  The original caller and the first child each
    stay behind to reap children and then hand the resulting wait status to
    exit_as_status(); only the grandchild returns to the caller's code.

    If functionality is not available (unshare fails with EINVAL), then it will
    return w/out doing anything, in the calling process itself.

    :return: The pid of the calling process as recorded before the namespace
        was created, i.e. the last pid outside of the namespace.
    :raises OSError: if unshare fails with anything other than EINVAL, or if
        remounting /proc fails.
    """
    # Recorded up front: this is the value every returning path hands back.
    first_pid = os.getpid()

    try:
        # First create the namespace.
        unshare(CLONE_NEWPID)
    except OSError as e:
        if e.errno == errno.EINVAL:
            # For older kernels, or the functionality is disabled in the config,
            # return silently.  We don't want to hard require this stuff.
            return first_pid
        else:
            # For all other errors, abort.  They shouldn't happen.
            raise

    # Now that we're in the new pid namespace, fork.  The parent is the master
    # of it in the original namespace, so it only monitors the child inside it.
    # It is only allowed to fork once too.
    if pid := os.fork():
        # Parent branch (pid is the first child's pid).
        # Mask SIGINT with the assumption that the child will catch & process it.
        # We'll pass that back up below.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Forward the control of the terminal to the child so it can manage input.
        _safe_tcsetpgrp(sys.stdin.fileno(), pid)

        # Reap the children as the parent of the new namespace.
        # NOTE(review): exit_as_status is imported from the package and is
        # presumed to terminate this process; if it returned, this branch
        # would fall through to the setpgrp()/return below -- confirm.
        exit_as_status(_reap_children(pid))
    else:
        # First child branch.
        # Make sure to unshare the existing mount point if needed.  Some distros
        # create shared mount points everywhere by default.
        try:
            _mount(None, "/proc", "proc", MS_PRIVATE | MS_REC)
        except OSError as e:
            # EINVAL is tolerated here; any other failure is fatal.
            if e.errno != errno.EINVAL:
                raise

        # The child needs its own proc mount as it'll be different.
        _mount("proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME)

        if pid := os.fork():
            # Still the first child (pid is the grandchild's pid).
            # Mask SIGINT with the assumption that the child will catch & process it.
            # We'll pass that back up below.
            signal.signal(signal.SIGINT, signal.SIG_IGN)

            # Now that we're in a new pid namespace, start a new process group so that
            # children have something valid to use.  Otherwise getpgrp/etc... will get
            # back 0 which tends to confuse -- you can't setpgrp(0) for example.
            os.setpgrp()

            # Forward the control of the terminal to the child so it can manage input.
            _safe_tcsetpgrp(sys.stdin.fileno(), pid)

            # Watch all of the children.  We need to act as the master inside the
            # namespace and reap old processes.
            # NOTE(review): same assumption as above -- exit_as_status is
            # presumed not to return.
            exit_as_status(_reap_children(pid))

    # Only the grandchild (second fork returned 0) is expected to get here.
    # Create a process group for the grandchild so it can manage things
    # independent of the init process.
    os.setpgrp()

    # The grandchild will return and take over the rest of the sdk steps.
    return first_pid


def create_netns():
    """Start a new net namespace

    Only the loopback interface is brought up inside it; nothing else is
    configured.

    If functionality is not available (unshare fails with EINVAL), then it will
    return w/out doing anything.

    :raises OSError: if unshare fails with anything other than EINVAL, or if
        launching ``ip`` fails for a reason other than the binary being absent.
    """
    # The net namespace was added in 2.6.24 and may be disabled in the kernel.
    try:
        unshare(CLONE_NEWNET)
    except OSError as err:
        # Anything but "unsupported" shouldn't happen, so abort on it.
        if err.errno != errno.EINVAL:
            raise
        return

    # A fresh net namespace starts with loopback down, so bring it up.
    # The kernel automatically adds the various ip addresses, so skip that.
    loopback_up = ["ip", "link", "set", "up", "lo"]
    try:
        subprocess.call(loopback_up)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
        # The ip tool is missing: warn, but carry on without loopback.
        sys.stderr.write(
            "warning: could not bring up loopback for network; "
            "install the iproute2 package\n"
        )


def create_utsns(hostname=None):
    """Start a new UTS namespace

    If functionality is not available (unshare fails with EINVAL), then it will
    return w/out doing anything.

    :param hostname: Hostname to set inside the new namespace.  When None, the
        settings inherited from the parent namespace are left alone.
    :raises OSError: if unshare fails with anything other than EINVAL, or if
        setting the hostname fails.
    """
    # The UTS namespace was added 2.6.19 and may be disabled in the kernel.
    try:
        unshare(CLONE_NEWUTS)
    except OSError as e:
        if e.errno == errno.EINVAL:
            # Unsupported or disabled: skip silently, same as the other
            # namespace helpers in this module.
            return
        # For all other errors, abort.  They shouldn't happen.
        raise

    # hostname/domainname default to the parent namespace settings if unset
    if hostname is not None:
        socket.sethostname(hostname)


def create_userns():
    """Start a new user namespace

    The caller's current uid and gid are mapped to id 0 inside the new
    namespace.

    If functionality is not available (unshare fails with EINVAL), then it will
    return w/out doing anything.

    :raises OSError: if unshare fails with anything other than EINVAL, or if
        any of the /proc/self mapping files cannot be written.
    """
    # Capture the ids first; they're changed on entering the namespace.
    outer_uid = os.getuid()
    outer_gid = os.getgid()

    try:
        unshare(CLONE_NEWUSER)
    except OSError as err:
        # Anything but "unsupported" shouldn't happen, so abort on it.
        if err.errno != errno.EINVAL:
            raise
        return

    # Written strictly in this order: setgroups, uid_map, then gid_map
    # (see user_namespaces(7) for why setgroups is denied before gid_map).
    proc_writes = (
        ("/proc/self/setgroups", "deny"),
        ("/proc/self/uid_map", "0 %s 1\n" % outer_uid),
        ("/proc/self/gid_map", "0 %s 1\n" % outer_gid),
    )
    for proc_path, payload in proc_writes:
        with open(proc_path, "w") as f:
            f.write(payload)


def simple_unshare(
    mount=True, uts=True, ipc=True, net=False, pid=False, user=False, hostname=None
):
    """Simpler helper for setting up namespaces quickly.

    If support for any namespace type is not available, we'll silently skip it.
    Any other failure while creating a namespace is raised to the caller.

    Namespaces are set up in a fixed order: user, mount, UTS, IPC, net, pid.

    :param mount: Create a mount namespace.
    :param uts: Create a UTS namespace.
    :param ipc: Create an IPC namespace.
    :param net: Create a net namespace.
    :param pid: Create a pid namespace.
    :param user: Create a user namespace.
    :param hostname: hostname to use for the UTS namespace
    :raises OSError: if creating a requested namespace fails for a reason other
        than the namespace type being unsupported.
    """
    # user namespace must be first
    if user:
        create_userns()

    # The mount namespace is the only one really guaranteed to exist --
    # it's been supported forever and it cannot be turned off.
    if mount:
        unshare(CLONE_NEWNS)

        # Avoid mounts in the new namespace from affecting the parent namespace
        # on systems that share the rootfs by default, but allow events in the
        # parent to propagate down.
        try:
            _mount(None, "/", None, MS_REC | MS_SLAVE)
        except OSError as e:
            if e.errno != errno.EINVAL:
                raise

    if uts:
        create_utsns(hostname)

    # The IPC namespace was added 2.6.19 and may be disabled in the kernel.
    if ipc:
        try:
            unshare(CLONE_NEWIPC)
        except OSError as e:
            # EINVAL means the namespace type is unavailable: skip it.  Any
            # other error is unexpected and must not be swallowed.
            if e.errno != errno.EINVAL:
                raise

    if net:
        create_netns()

    if pid:
        create_pidns()