danbooru/app/logical/sandbox.rb

# frozen_string_literal: true

# Run a program inside an isolated sandbox, much like a Docker container. Inside the sandbox,
# the program doesn't have network access, can't see other programs, and can only see read-only
# OS directories like /usr and /lib. It can only communicate by reading from stdin and printing
# output to stdout.
#
# This is based on a combination of Linux namespaces, to isolate the process in a container,
# and Seccomp, to restrict system calls.
#
# @example
#   # Run a command in a sandbox and return the result as a string.
#   output = Sandbox.new(stdin: File.open("image.jpg")).run!("exiftool -json -")
#
#   # Open a shell inside the sandbox.
#   Sandbox.new.shell
#
#   # Doesn't work - no network access.
#   Sandbox.new.system("ping 127.0.0.1")
#
#   # Doesn't work - no access to /home.
#   Sandbox.new.system("cat ~/.ssh/id_rsa")
#
#   # Doesn't work - no access to test.txt.
#   FileUtils.touch("test.txt")
#   Sandbox.new.system("rm test.txt")
#
# Documentation:
#
# @see https://en.wikipedia.org/wiki/Linux_namespaces
# @see https://blog.lizzie.io/linux-containers-in-500-loc.html
# @see https://jvns.ca/blog/2020/04/27/new-zine-how-containers-work/
# @see https://zserge.com/posts/containers/
# @see https://man7.org/linux/man-pages/man7/namespaces.7.html
# @see https://www.youtube.com/watch?v=8fi7uSYlOdc
#
# Utilities:
#
# @see https://github.com/netblue30/firejail
# @see https://github.com/google/nsjail
# @see https://man7.org/linux/man-pages/man1/setpriv.1.html
# @see https://man7.org/linux/man-pages/man1/unshare.1.html
class Sandbox
  class Error < StandardError; end

  attr_accessor :stdin, :stdout, :stderr, :root, :process, :network, :hostname, :tmp, :ro, :rw, :env, :seccomp

  # Configure a new sandbox. Call `#confine!` afterward to run code in the sandbox.
  #
  # @param stdin [File, nil] The stdin use inside the sandbox. If nil, redirect stdin to /dev/null.
  # @param stdout [File, nil] The stdout to use inside the sandbox. If nil, redirect stdout to /dev/null.
  # @param stderr [File, nil] The stderr to use inside the sandbox. If nil, redirect stderr to /dev/null.
  # @param process [Boolean] If true, allow sandboxed processes to see processes outside the sandbox. Default: false.
  # @param network [Boolean] If true, allow network access inside the sandbox. Default: false.
  # @param hostname [Boolean] If true, allow sandboxed processes to see the system hostname.
  #   If false, generate a random hostname inside the sandbox. Default: false.
  # @param root [Boolean] If true, run the sandboxed process with root privileges. Note: this
  #   doesn't give root privileges outside the sandbox. Default: false.
  # @param tmp [Boolean] If true, mount /tmp, /run, and /dev/shm in the sandbox. Default: false.
  # @param ro [Array<String>] The list of directories to allow read-only access to inside the sandbox.
  # @param rw [Array<String>] The list of directories to allow read-write access to inside the sandbox.
  # @param env [Array<String>] The list of environment variables to keep inside the sandbox. Default: none.
  # @param seccomp [Seccomp::Filter, nil] If present, a system call filter to apply inside the sandbox.
  def initialize(stdin: $stdin, stdout: $stdout, stderr: $stderr, root: false, process: false,
                 network: false, hostname: false, tmp: false, seccomp: Seccomp.allow("@common"),
                 ro: %w[/usr /lib /lib64 /bin /sbin], rw: [], env: [])
    @stdin  = stdin.nil?  ? File.open("/dev/null", "r") : stdin
    @stdout = stdout.nil? ? File.open("/dev/null", "w") : stdout
    @stderr = stderr.nil? ? File.open("/dev/null", "w") : stderr
    @root = root
    @network = network
    @process = process
    @hostname = hostname
    @tmp = tmp
    @ro = ro
    @rw = rw
    @env = env
    @seccomp = seccomp
  end

  # Run a block of code in a sandboxed subprocess.
  # @return [Integer] the process ID of the subprocess
  def confine!(&block)
    clear_env!
    redirect_stdio!
    close_fds!
    no_new_privs!

    new_user_namespace!
    new_pid_namespace! do
      new_hostname_namespace!
      new_network_namespace!
      new_cgroup_namespace!
      new_ipc_namespace!
      new_mount_namespace!
      filter_syscalls!

      yield
    end
  end

  # Run a program in the sandbox and return immediately.
  # @return [Integer] the process ID of the command
  def spawn(*args)
    Process.fork do
      pid = confine! do
        Process.exec(*args)
      end

      status = waitpid!(pid)
      exit status.exitstatus
    end
  end

  # Run a program in the sandbox and return its output. Raise an error if it fails.
  # @return [String] The stdout of the command.
  def run!(*args)
    IO.pipe do |reader, writer|
      sandbox = dup.tap { |o| o.stdout = writer }
      pid = sandbox.spawn(*args)
      writer.close

      ret = reader.read
      status = waitpid!(pid)
      raise Error, "run!(#{args.map(&:inspect).join}) failed (exit #{status.exitstatus})" if !status.success?

      ret
    end
  end

  # Run a program in the sandbox and return its output. Return nil if it fails.
  # @return [String, nil] The stdout of the command, or nil if it failed.
  def run(*args)
    run!(*args)
  rescue Error
    nil
  end

  # Run a program in the sandbox and wait for it to finish.
  # @return [Process::Status] the exit status of the program
  def system(command, *args)
    pid = spawn(command, *args)
    waitpid!(pid)
  end

  # Run an interactive shell in the sandbox.
  def shell(shell = "/bin/sh")
    system(shell)
  end

  protected

  # Wait on a subprocess to exit. Raise an error if it is killed because it called an
  # unauthorized syscall blocked by the seccomp filter.
  def waitpid!(pid)
    pid, status = Process.wait2(pid)

    if status.signaled? && Signal.signame(status.termsig) == "SYS"
      raise Error, "Command failed: called unauthorized syscall (see dmesg for details)"
    else
      status
    end
  end

  # Move our process to a new user namespace. Inside the namespace, our process runs under a
  # different UID/GID than outside the namespace.
  #
  # Creating a user namespace grants us root privileges inside the namespace. This lets us set
  # up other namespaces. We later drop these privileges after setting up the other namespaces.
  #
  # @see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
  def new_user_namespace!
    outer_uid, outer_gid = Process.uid, Process.gid
    uid = root ? 0 : outer_uid
    gid = root ? 0 : outer_gid

    raise Error, "multi-threaded processes can't be sandboxed (hint: set DANBOORU_DEBUG_MODE=1)" if Thread.list.count > 1
    Linux.unshare!([:clone_newuser])

    # Tell the kernel how to map our UID and GID outside the namespace to new
    # (potentially different) IDs inside the namespace. See user_namespaces(7).
    File.write("/proc/self/setgroups", "deny")
    File.write("/proc/self/uid_map", "#{uid} #{outer_uid} 1\n")
    File.write("/proc/self/gid_map", "#{gid} #{outer_gid} 1\n")
  end

  # Move our process to a new hostname namespace. Inside the namespace, we set a new random hostname.
  def new_hostname_namespace!
    return if hostname

    Linux.unshare!([:clone_newuts])
    sethostname!(SecureRandom.uuid)
  end

  # Move our process to a new PID namespace. Inside the namespace, we run as PID 1 and we can't
  # see any processes outside the namespace. This requires a fork to spawn a new child in the namespace.
  def new_pid_namespace!(&block)
    return yield if process

    Linux.unshare!([:clone_newpid])
    Process.fork(&block)
  end

  # Move our process to a new network namespace. Inside the namespace, all we have is a
  # disabled localhost interface, so we have no network access.
  def new_network_namespace!
    return if network
    Linux.unshare!([:clone_newnet])
  end

  # Move our process to a new Cgroup namespace.
  def new_cgroup_namespace!
    Linux.unshare!([:clone_newcgroup])
  end

  # Move our process to a new IPC namespace.
  def new_ipc_namespace!
    Linux.unshare!([:clone_newipc])
  end

  # Move our process to a new mount namespace. Inside the namespace, our process has its own
  # unique view of of the filesystem.
  def new_mount_namespace!
    Linux.unshare!([:clone_newns])
    mount!("tmpfs", "/tmp", fstype: "tmpfs")

    ro.each do |path|
      # XXX bug: submounts don't get mounted readonly.
      bind_mount!(path, File.join("/tmp", path), flags: %i[rdonly nodev nosuid])
    end
    rw.each do |path|
      bind_mount!(path, File.join("/tmp", path), flags: %i[nodev nosuid])
    end

    if process
      bind_mount!("/proc", "/tmp/proc")
    else
      mount!("proc", "/tmp/proc", fstype: "proc", flags: %i[rdonly])
    end

    if tmp
      mount!("tmpfs", "/tmp/tmp", fstype: "tmpfs")
      mount!("tmpfs", "/tmp/run", fstype: "tmpfs")
      mount!("tmpfs", "/tmp/dev/shm", fstype: "tmpfs")
    end

    remount!("/tmp", flags: %i[rdonly nodev nosuid noexec noatime])
    pivot_root!("/tmp")
  end

  # @return [Array<Integer>] The list of currently open file descriptors for the process.
  def open_fds
    Dir.open("/proc/self/fd") do |dir|
      # Don't include "/proc/self/fd" itself in the list of open files
      dir.children.map(&:to_i) - [dir.fileno]
    end
  end

  # Close all open files for the process, except stdin, stdout, and stderr.
  #
  # @param keep [Array<Integer>] The list of open files to keep.
  # @return [void]
  def close_fds!(keep: [0, 1, 2])
    fds = open_fds - keep

    fds.each do |fd|
      IO.new(fd).close
    rescue ArgumentError
      # Trying to close FDs 3 and 4 will raise an ArgumentError because these
      # FDs are used internally by the Ruby VM. Ignore the error.
    end
  end

  # Redirect stdin, stdout, and stderr for our process.
  def redirect_stdio!
    IO.new(0).reopen(stdin)
    IO.new(1).reopen(stdout)
    IO.new(2).reopen(stderr)
  end

  def clear_env!
    ENV.delete_if do |name, value|
      !env.include?(name)
    end
  end

  # Activate seccomp(2) filtering.
  def filter_syscalls!
    seccomp&.apply!
  end

  # Call prctl(PR_SET_NO_NEW_PRIVS, 1). This makes it so setuid binaries have
  # no effect, so we can't elevate privileges by running things like sudo(1).
  # This is required by seccomp(2).
  #
  # @see https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
  def no_new_privs!
    Linux.prctl!(:set_no_new_privs, 1, 0, 0, 0)
  end

  # Mount the `source` filesystem on the `target` directory.
  def mount!(source, target, fstype: nil, flags: [])
    FileUtils.mkdir_p(target, mode: 0755)
    Linux.mount!(source, target, fstype, flags, nil)
  end

  # Remount an existing mountpoint with the new `flags`.
  def remount!(target, flags: [])
    mount!(nil, target, flags: flags + [:remount])
  end

  # Bind mount a directory to a new mountpoint. Bind mounting `/usr` to
  # `/tmp/usr` means `/tmp/usr` refers to the same directory as `/usr`.
  def bind_mount!(source, target, flags: [])
    mount!(source, target, flags: [:bind, :rec, :private])
    remount!(target, flags: [:bind, *flags])
  end

  # Change the root (`/`) directory to the given directory.
  def pivot_root!(newroot)
    Linux.pivot_root!(newroot, newroot)
    Dir.chdir("/")

    # The new root was mounted on top of the old root. This unmounts the old root.
    Linux.umount2!(".", :detach)
  end

  # Change the system hostname.
  def sethostname!(hostname)
    Linux.sethostname!(hostname, hostname.size)
  end

  # Create Ruby bindings for various Linux kernel syscalls.
  # https://github.com/ffi/ffi/wiki
  module Linux
    extend FFI::Library
    ffi_lib FFI::Library::LIBC

    # Create a Ruby method that calls the given system call, and define a bang version that
    # raises an error if the syscall fails.
    def self.attach_function(name, *args)
      define_singleton_method("#{name}!") do |*args|
        ret = send(name, *args)

        if ret < 0
          message = "#{name}(#{args.map(&:inspect).join(", ")})"
          raise SystemCallError.new(message, FFI.errno)
        else
          ret
        end
      end

      super(name, *args)
    end

    # https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
    enum :prctl_command, [
      :set_no_new_privs, 38
    ]

    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/sched.h
    bitmask :unshare_flags, [
      :clone_time,      7,  # 0x00000080, New time namespace
      :clone_newns,     17, # 0x00020000, New mount (filesystem) namespace
      :clone_newcgroup, 25, # 0x02000000, New cgroup namespace
      :clone_newuts,    26, # 0x04000000, New utsname (hostname) namespace
      :clone_newipc,    27, # 0x08000000, New ipc namespace
      :clone_newuser,   28, # 0x10000000, New user namespace
      :clone_newpid,    29, # 0x20000000, New pid namespace
      :clone_newnet,    30, # 0x40000000, New network namespace
    ]

    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/mount.h
    bitmask :mount_flags, [
      :rdonly,  0,
      :nosuid,  1,
      :nodev,   2,
      :noexec,  3,
      :remount, 5,
      :noatime, 10,
      :bind,    12,
      :rec,     14,
      :private, 18,
      :slave,   19,
    ]

    # https://github.com/torvalds/linux/blob/master/include/linux/fs.h#L1425
    bitmask :umount_flags, [
      :detach, 1 # 0x2
    ]

    # prctl - operations on a process or thread
    # https://man7.org/linux/man-pages/man2/prctl.2.html
    # int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);
    attach_function :prctl, [:prctl_command, :long, :long, :long, :long], :int

    # unshare - disassociate parts of the process execution context
    # https://man7.org/linux/man-pages/man2/unshare.2.html
    # https://man7.org/linux/man-pages/man7/namespaces.7.html
    # int unshare(int flags);
    attach_function :unshare, [:unshare_flags], :int

    # mount - mount filesystem
    # https://man7.org/linux/man-pages/man2/mount.2.html
    # int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
    attach_function :mount, [:string, :string, :string, :mount_flags, :pointer], :int

    # umount, umount2 - unmount filesystem
    # https://man7.org/linux/man-pages/man2/umount2.2.html
    # int umount2(const char *target, int flags);
    attach_function :umount2, [:string, :umount_flags], :int

    # pivot_root - change the root mount
    # https://man7.org/linux/man-pages/man2/pivot_root.2.html
    # int pivot_root(const char *new_root, const char *put_old);
    attach_function :pivot_root, [:string, :string], :int

    # sethostname - set system hostname
    # https://man7.org/linux/man-pages/man2/sethostname.2.html
    # int sethostname(const char *name, size_t len);
    attach_function :sethostname, [:string, :size_t], :int
  end
end