# frozen_string_literal: true # Run a program inside an isolated sandbox, much like a Docker container. Inside the sandbox, # the program doesn't have network access, can't see other programs, and can only see read-only # OS directories like /usr and /lib. It can only communicate by reading from stdin and printing # output to stdout. # # This is based on a combination of Linux namespaces, to isolate the process in a container, # and Seccomp, to restrict system calls. # # @example # # Run a command in a sandbox and return the result as a string. # output = Sandbox.new(stdin: File.open("image.jpg")).run!("exiftool -json -") # # # Open a shell inside the sandbox. # Sandbox.new.shell # # # Doesn't work - no network access. # Sandbox.new.system("ping 127.0.0.1") # # # Doesn't work - no access to /home. # Sandbox.new.system("cat ~/.ssh/id_rsa") # # # Doesn't work - no access to test.txt. # FileUtils.touch("test.txt") # Sandbox.new.system("rm test.txt") # # Documentation: # # @see https://en.wikipedia.org/wiki/Linux_namespaces # @see https://blog.lizzie.io/linux-containers-in-500-loc.html # @see https://jvns.ca/blog/2020/04/27/new-zine-how-containers-work/ # @see https://zserge.com/posts/containers/ # @see https://man7.org/linux/man-pages/man7/namespaces.7.html # @see https://www.youtube.com/watch?v=8fi7uSYlOdc # # Utilities: # # @see https://github.com/netblue30/firejail # @see https://github.com/google/nsjail # @see https://man7.org/linux/man-pages/man1/setpriv.1.html # @see https://man7.org/linux/man-pages/man1/unshare.1.html class Sandbox class Error < StandardError; end attr_accessor :stdin, :stdout, :stderr, :root, :process, :network, :hostname, :tmp, :ro, :rw, :env, :seccomp # Configure a new sandbox. Call `#confine!` afterward to run code in the sandbox. # # @param stdin [File, nil] The stdin use inside the sandbox. If nil, redirect stdin to /dev/null. # @param stdout [File, nil] The stdout to use inside the sandbox. If nil, redirect stdout to /dev/null. # @param stderr [File, nil] The stderr to use inside the sandbox. If nil, redirect stderr to /dev/null. # @param process [Boolean] If true, allow sandboxed processes to see processes outside the sandbox. Default: false. # @param network [Boolean] If true, allow network access inside the sandbox. Default: false. # @param hostname [Boolean] If true, allow sandboxed processes to see the system hostname. # If false, generate a random hostname inside the sandbox. Default: false. # @param root [Boolean] If true, run the sandboxed process with root privileges. Note: this # doesn't give root privileges outside the sandbox. Default: false. # @param tmp [Boolean] If true, mount /tmp, /run, and /dev/shm in the sandbox. Default: false. # @param ro [Array] The list of directories to allow read-only access to inside the sandbox. # @param rw [Array] The list of directories to allow read-write access to inside the sandbox. # @param env [Array] The list of environment variables to keep inside the sandbox. Default: none. # @param seccomp [Seccomp::Filter, nil] If present, a system call filter to apply inside the sandbox. def initialize(stdin: $stdin, stdout: $stdout, stderr: $stderr, root: false, process: false, network: false, hostname: false, tmp: false, seccomp: Seccomp.allow("@common"), ro: %w[/usr /lib /lib64 /bin /sbin], rw: [], env: []) @stdin = stdin.nil? ? File.open("/dev/null", "r") : stdin @stdout = stdout.nil? ? File.open("/dev/null", "w") : stdout @stderr = stderr.nil? ? File.open("/dev/null", "w") : stderr @root = root @network = network @process = process @hostname = hostname @tmp = tmp @ro = ro @rw = rw @env = env @seccomp = seccomp end # Run a block of code in a sandboxed subprocess. # @return [Integer] the process ID of the subprocess def confine!(&block) clear_env! redirect_stdio! close_fds! no_new_privs! new_user_namespace! new_pid_namespace! do new_hostname_namespace! new_network_namespace! new_cgroup_namespace! new_ipc_namespace! new_mount_namespace! filter_syscalls! yield end end # Run a program in the sandbox and return immediately. # @return [Integer] the process ID of the command def spawn(*args) Process.fork do pid = confine! do Process.exec(*args) end status = waitpid!(pid) exit status.exitstatus end end # Run a program in the sandbox and return its output. Raise an error if it fails. # @return [String] The stdout of the command. def run!(*args) IO.pipe do |reader, writer| sandbox = dup.tap { |o| o.stdout = writer } pid = sandbox.spawn(*args) writer.close ret = reader.read status = waitpid!(pid) raise Error, "run!(#{args.map(&:inspect).join}) failed (exit #{status.exitstatus})" if !status.success? ret end end # Run a program in the sandbox and return its output. Return nil if it fails. # @return [String, nil] The stdout of the command, or nil if it failed. def run(*args) run!(*args) rescue Error nil end # Run a program in the sandbox and wait for it to finish. # @return [Process::Status] the exit status of the program def system(command, *args) pid = spawn(command, *args) waitpid!(pid) end # Run an interactive shell in the sandbox. def shell(shell = "/bin/sh") system(shell) end protected # Wait on a subprocess to exit. Raise an error if it is killed because it called an # unauthorized syscall blocked by the seccomp filter. def waitpid!(pid) pid, status = Process.wait2(pid) if status.signaled? && Signal.signame(status.termsig) == "SYS" raise Error, "Command failed: called unauthorized syscall (see dmesg for details)" else status end end # Move our process to a new user namespace. Inside the namespace, our process runs under a # different UID/GID than outside the namespace. # # Creating a user namespace grants us root privileges inside the namespace. This lets us set # up other namespaces. We later drop these privileges after setting up the other namespaces. # # @see https://man7.org/linux/man-pages/man7/user_namespaces.7.html def new_user_namespace! outer_uid, outer_gid = Process.uid, Process.gid uid = root ? 0 : outer_uid gid = root ? 0 : outer_gid raise Error, "multi-threaded processes can't be sandboxed (hint: set DANBOORU_DEBUG_MODE=1)" if Thread.list.count > 1 Linux.unshare!([:clone_newuser]) # Tell the kernel how to map our UID and GID outside the namespace to new # (potentially different) IDs inside the namespace. See user_namespaces(7). File.write("/proc/self/setgroups", "deny") File.write("/proc/self/uid_map", "#{uid} #{outer_uid} 1\n") File.write("/proc/self/gid_map", "#{gid} #{outer_gid} 1\n") end # Move our process to a new hostname namespace. Inside the namespace, we set a new random hostname. def new_hostname_namespace! return if hostname Linux.unshare!([:clone_newuts]) sethostname!(SecureRandom.uuid) end # Move our process to a new PID namespace. Inside the namespace, we run as PID 1 and we can't # see any processes outside the namespace. This requires a fork to spawn a new child in the namespace. def new_pid_namespace!(&block) return yield if process Linux.unshare!([:clone_newpid]) Process.fork(&block) end # Move our process to a new network namespace. Inside the namespace, all we have is a # disabled localhost interface, so we have no network access. def new_network_namespace! return if network Linux.unshare!([:clone_newnet]) end # Move our process to a new Cgroup namespace. def new_cgroup_namespace! Linux.unshare!([:clone_newcgroup]) end # Move our process to a new IPC namespace. def new_ipc_namespace! Linux.unshare!([:clone_newipc]) end # Move our process to a new mount namespace. Inside the namespace, our process has its own # unique view of of the filesystem. def new_mount_namespace! Linux.unshare!([:clone_newns]) mount!("tmpfs", "/tmp", fstype: "tmpfs") ro.each do |path| # XXX bug: submounts don't get mounted readonly. bind_mount!(path, File.join("/tmp", path), flags: %i[rdonly nodev nosuid]) end rw.each do |path| bind_mount!(path, File.join("/tmp", path), flags: %i[nodev nosuid]) end if process bind_mount!("/proc", "/tmp/proc") else mount!("proc", "/tmp/proc", fstype: "proc", flags: %i[rdonly]) end if tmp mount!("tmpfs", "/tmp/tmp", fstype: "tmpfs") mount!("tmpfs", "/tmp/run", fstype: "tmpfs") mount!("tmpfs", "/tmp/dev/shm", fstype: "tmpfs") end remount!("/tmp", flags: %i[rdonly nodev nosuid noexec noatime]) pivot_root!("/tmp") end # @return [Array] The list of currently open file descriptors for the process. def open_fds Dir.open("/proc/self/fd") do |dir| # Don't include "/proc/self/fd" itself in the list of open files dir.children.map(&:to_i) - [dir.fileno] end end # Close all open files for the process, except stdin, stdout, and stderr. # # @param keep [Array] The list of open files to keep. # @return [void] def close_fds!(keep: [0, 1, 2]) fds = open_fds - keep fds.each do |fd| IO.new(fd).close rescue ArgumentError # Trying to close FDs 3 and 4 will raise an ArgumentError because these # FDs are used internally by the Ruby VM. Ignore the error. end end # Redirect stdin, stdout, and stderr for our process. def redirect_stdio! IO.new(0).reopen(stdin) IO.new(1).reopen(stdout) IO.new(2).reopen(stderr) end def clear_env! ENV.delete_if do |name, value| !env.include?(name) end end # Activate seccomp(2) filtering. def filter_syscalls! seccomp&.apply! end # Call prctl(PR_SET_NO_NEW_PRIVS, 1). This makes it so setuid binaries have # no effect, so we can't elevate privileges by running things like sudo(1). # This is required by seccomp(2). # # @see https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html def no_new_privs! Linux.prctl!(:set_no_new_privs, 1, 0, 0, 0) end # Mount the `source` filesystem on the `target` directory. def mount!(source, target, fstype: nil, flags: []) FileUtils.mkdir_p(target, mode: 0755) Linux.mount!(source, target, fstype, flags, nil) end # Remount an existing mountpoint with the new `flags`. def remount!(target, flags: []) mount!(nil, target, flags: flags + [:remount]) end # Bind mount a directory to a new mountpoint. Bind mounting `/usr` to # `/tmp/usr` means `/tmp/usr` refers to the same directory as `/usr`. def bind_mount!(source, target, flags: []) mount!(source, target, flags: [:bind, :rec, :private]) remount!(target, flags: [:bind, *flags]) end # Change the root (`/`) directory to the given directory. def pivot_root!(newroot) Linux.pivot_root!(newroot, newroot) Dir.chdir("/") # The new root was mounted on top of the old root. This unmounts the old root. Linux.umount2!(".", :detach) end # Change the system hostname. def sethostname!(hostname) Linux.sethostname!(hostname, hostname.size) end # Create Ruby bindings for various Linux kernel syscalls. # https://github.com/ffi/ffi/wiki module Linux extend FFI::Library ffi_lib FFI::Library::LIBC # Create a Ruby method that calls the given system call, and define a bang version that # raises an error if the syscall fails. def self.attach_function(name, *args) define_singleton_method("#{name}!") do |*args| ret = send(name, *args) if ret < 0 message = "#{name}(#{args.map(&:inspect).join(", ")})" raise SystemCallError.new(message, FFI.errno) else ret end end super(name, *args) end # https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html # https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h enum :prctl_command, [ :set_no_new_privs, 38 ] # https://github.com/torvalds/linux/blob/master/include/uapi/linux/sched.h bitmask :unshare_flags, [ :clone_time, 7, # 0x00000080, New time namespace :clone_newns, 17, # 0x00020000, New mount (filesystem) namespace :clone_newcgroup, 25, # 0x02000000, New cgroup namespace :clone_newuts, 26, # 0x04000000, New utsname (hostname) namespace :clone_newipc, 27, # 0x08000000, New ipc namespace :clone_newuser, 28, # 0x10000000, New user namespace :clone_newpid, 29, # 0x20000000, New pid namespace :clone_newnet, 30, # 0x40000000, New network namespace ] # https://github.com/torvalds/linux/blob/master/include/uapi/linux/mount.h bitmask :mount_flags, [ :rdonly, 0, :nosuid, 1, :nodev, 2, :noexec, 3, :remount, 5, :noatime, 10, :bind, 12, :rec, 14, :private, 18, :slave, 19, ] # https://github.com/torvalds/linux/blob/master/include/linux/fs.h#L1425 bitmask :umount_flags, [ :detach, 1 # 0x2 ] # prctl - operations on a process or thread # https://man7.org/linux/man-pages/man2/prctl.2.html # int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); attach_function :prctl, [:prctl_command, :long, :long, :long, :long], :int # unshare - disassociate parts of the process execution context # https://man7.org/linux/man-pages/man2/unshare.2.html # https://man7.org/linux/man-pages/man7/namespaces.7.html # int unshare(int flags); attach_function :unshare, [:unshare_flags], :int # mount - mount filesystem # https://man7.org/linux/man-pages/man2/mount.2.html # int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data); attach_function :mount, [:string, :string, :string, :mount_flags, :pointer], :int # umount, umount2 - unmount filesystem # https://man7.org/linux/man-pages/man2/umount2.2.html # int umount2(const char *target, int flags); attach_function :umount2, [:string, :umount_flags], :int # pivot_root - change the root mount # https://man7.org/linux/man-pages/man2/pivot_root.2.html # int pivot_root(const char *new_root, const char *put_old); attach_function :pivot_root, [:string, :string], :int # sethostname - set system hostname # https://man7.org/linux/man-pages/man2/sethostname.2.html # int sethostname(const char *name, size_t len); attach_function :sethostname, [:string, :size_t], :int end end