From 8cf00cd1a6a93fe85ce26a0ece9d2697d45a89f0 Mon Sep 17 00:00:00 2001
From: evazion <noizave@gmail.com>
Date: Sat, 13 Nov 2021 04:58:31 -0600
Subject: [PATCH] Add sandbox for running untrusted code.

Add a Sandbox class for running untrusted external programs like ffmpeg
or exiftool inside a sandbox. This uses Linux namespaces to run the
process in an isolated container, much like a Docker container. Unlike a
Docker container, we can use it to sandbox programs when Danbooru itself
is already running inside a Docker container.

This is also more restrictive than Docker in several ways:

* It has a system call filter that is more restrictive and more
  customizable than Docker's filter by default. Even if the process
  breaks out of the container, the syscall filter will limit what it can
  do, even if it escalates to root.
* It blocks the use of setuid binaries, so the process can't use things
  like sudo to escalate to root inside the sandbox.
* It blocks all network access inside the sandbox by default.
* All files in the container are read-only by default. The sandboxed
  process can only communicate by writing to stdout.

See app/logical/sandbox.rb for more details.

This isn't actually enabled yet. It will be rolled out progressively to
ensure it doesn't break things.
---
 app/logical/sandbox.rb             | 418 +++++++++++++++++++++++++++++
 config/environments/development.rb |   2 +-
 2 files changed, 419 insertions(+), 1 deletion(-)
 create mode 100644 app/logical/sandbox.rb

diff --git a/app/logical/sandbox.rb b/app/logical/sandbox.rb
new file mode 100644
index 000000000..c37cae285
--- /dev/null
+++ b/app/logical/sandbox.rb
@@ -0,0 +1,418 @@
+# Run a program inside an isolated sandbox, much like a Docker container. Inside the sandbox,
+# the program doesn't have network access, can't see other programs, and can only see read-only
+# OS directories like /usr and /lib. It can only communicate by reading from stdin and printing
+# output to stdout.
+#
+# This is based on a combination of Linux namespaces, to isolate the process in a container,
+# and Seccomp, to restrict system calls.
+#
+# @example
+#   # Run a command in a sandbox and return the result as a string.
+#   output = Sandbox.new(stdin: File.open("image.jpg")).run!("exiftool -json -")
+#
+#   # Open a shell inside the sandbox.
+#   Sandbox.new.shell
+#
+#   # Doesn't work - no network access.
+#   Sandbox.new.system("ping 127.0.0.1")
+#
+#   # Doesn't work - no access to /home.
+#   Sandbox.new.system("cat ~/.ssh/id_rsa")
+#
+#   # Doesn't work - no access to test.txt.
+#   FileUtils.touch("test.txt")
+#   Sandbox.new.system("rm test.txt")
+#
+# Documentation:
+#
+# @see https://en.wikipedia.org/wiki/Linux_namespaces
+# @see https://blog.lizzie.io/linux-containers-in-500-loc.html
+# @see https://jvns.ca/blog/2020/04/27/new-zine-how-containers-work/
+# @see https://zserge.com/posts/containers/
+# @see https://man7.org/linux/man-pages/man7/namespaces.7.html
+# @see https://www.youtube.com/watch?v=8fi7uSYlOdc
+#
+# Utilities:
+#
+# @see https://github.com/netblue30/firejail
+# @see https://github.com/google/nsjail
+# @see https://man7.org/linux/man-pages/man1/setpriv.1.html
+# @see https://man7.org/linux/man-pages/man1/unshare.1.html
+class Sandbox
+  class Error < StandardError; end
+
+  attr_accessor :stdin, :stdout, :stderr, :root, :process, :network, :hostname, :tmp, :ro, :rw, :env, :seccomp
+
+  # Configure a new sandbox. Call `#confine!` afterward to run code in the sandbox.
+  #
+  # @param stdin [File, nil] The stdin use inside the sandbox. If nil, redirect stdin to /dev/null.
+  # @param stdout [File, nil] The stdout to use inside the sandbox. If nil, redirect stdout to /dev/null.
+  # @param stderr [File, nil] The stderr to use inside the sandbox. If nil, redirect stderr to /dev/null.
+  # @param process [Boolean] If true, allow sandboxed processes to see processes outside the sandbox. Default: false.
+  # @param network [Boolean] If true, allow network access inside the sandbox. Default: false.
+  # @param hostname [Boolean] If true, allow sandboxed processes to see the system hostname.
+  #   If false, generate a random hostname inside the sandbox. Default: false.
+  # @param root [Boolean] If true, run the sandboxed process with root privileges. Note: this
+  #   doesn't give root privileges outside the sandbox. Default: false.
+  # @param tmp [Boolean] If true, mount /tmp, /run, and /dev/shm in the sandbox. Default: false.
+  # @param ro [Array<String>] The list of directories to allow read-only access to inside the sandbox.
+  # @param rw [Array<String>] The list of directories to allow read-write access to inside the sandbox.
+  # @param env [Array<String>] The list of environment variables to keep inside the sandbox. Default: none.
+  # @param seccomp [Seccomp::Filter, nil] If present, a system call filter to apply inside the sandbox.
+  def initialize(stdin: $stdin, stdout: $stdout, stderr: $stderr, root: false, process: false,
+                 network: false, hostname: false, tmp: false, seccomp: Seccomp.allow("@common"),
+                 ro: %w[/usr /lib /lib64 /bin /sbin], rw: [], env: [])
+    @stdin  = stdin.nil?  ? File.open("/dev/null", "r") : stdin
+    @stdout = stdout.nil? ? File.open("/dev/null", "w") : stdout
+    @stderr = stderr.nil? ? File.open("/dev/null", "w") : stderr
+    @root = root
+    @network = network
+    @process = process
+    @hostname = hostname
+    @tmp = tmp
+    @ro = ro
+    @rw = rw
+    @env = env
+    @seccomp = seccomp
+  end
+
+  # Run a block of code in a sandboxed subprocess.
+  # @return [Integer] the process ID of the subprocess
+  def confine!(&block)
+    clear_env!
+    redirect_stdio!
+    close_fds!
+    no_new_privs!
+
+    new_user_namespace!
+    new_pid_namespace! do
+      new_hostname_namespace!
+      new_network_namespace!
+      new_cgroup_namespace!
+      new_ipc_namespace!
+      new_mount_namespace!
+      filter_syscalls!
+
+      yield
+    end
+  end
+
+  # Run a program in the sandbox and return immediately.
+  # @return [Integer] the process ID of the command
+  def spawn(*args)
+    Process.fork do
+      pid = confine! do
+        Process.exec(*args)
+      end
+
+      status = waitpid!(pid)
+      exit status.exitstatus
+    end
+  end
+
+  # Run a program in the sandbox and return its output. Raise an error if it fails.
+  # @return [String] The stdout of the command.
+  def run!(*args)
+    IO.pipe do |reader, writer|
+      sandbox = dup.tap { |o| o.stdout = writer }
+      pid = sandbox.spawn(*args)
+      writer.close
+
+      ret = reader.read
+      status = waitpid!(pid)
+      raise Error, "run!(#{args.map(&:inspect).join}) failed (exit #{status.exitstatus})" if !status.success?
+
+      ret
+    end
+  end
+
+  # Run a program in the sandbox and return its output. Return nil if it fails.
+  # @return [String, nil] The stdout of the command, or nil if it failed.
+  def run(*args)
+    run!(*args)
+  rescue Error
+    nil
+  end
+
+  # Run a program in the sandbox and wait for it to finish.
+  # @return [Process::Status] the exit status of the program
+  def system(command, *args)
+    pid = spawn(command, *args)
+    waitpid!(pid)
+  end
+
+  # Run an interactive shell in the sandbox.
+  def shell(shell = "/bin/sh")
+    system(shell)
+  end
+
+  protected
+
+  # Wait on a subprocess to exit. Raise an error if it is killed because it called an
+  # unauthorized syscall blocked by the seccomp filter.
+  def waitpid!(pid)
+    pid, status = Process.wait2(pid)
+
+    if status.signaled? && Signal.signame(status.termsig) == "SYS"
+      raise Error, "Command failed: called unauthorized syscall (see dmesg for details)"
+    else
+      status
+    end
+  end
+
+  # Move our process to a new user namespace. Inside the namespace, our process runs under a
+  # different UID/GID than outside the namespace.
+  #
+  # Creating a user namespace grants us root privileges inside the namespace. This lets us set
+  # up other namespaces. We later drop these privileges after setting up the other namespaces.
+  #
+  # @see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
+  def new_user_namespace!
+    outer_uid, outer_gid = Process.uid, Process.gid
+    uid = root ? 0 : outer_uid
+    gid = root ? 0 : outer_gid
+
+    raise Error, "multi-threaded processes can't be sandboxed (hint: set DANBOORU_DEBUG_MODE=1)" if Thread.list.count > 1
+    Linux.unshare!([:clone_newuser])
+
+    # Tell the kernel how to map our UID and GID outside the namespace to new
+    # (potentially different) IDs inside the namespace. See user_namespaces(7).
+    File.write("/proc/self/setgroups", "deny")
+    File.write("/proc/self/uid_map", "#{uid} #{outer_uid} 1\n")
+    File.write("/proc/self/gid_map", "#{gid} #{outer_gid} 1\n")
+  end
+
+  # Move our process to a new hostname namespace. Inside the namespace, we set a new random hostname.
+  def new_hostname_namespace!
+    return if hostname
+
+    Linux.unshare!([:clone_newuts])
+    sethostname!(SecureRandom.uuid)
+  end
+
+  # Move our process to a new PID namespace. Inside the namespace, we run as PID 1 and we can't
+  # see any processes outside the namespace. This requires a fork to spawn a new child in the namespace.
+  def new_pid_namespace!(&block)
+    return yield if process
+
+    Linux.unshare!([:clone_newpid])
+    Process.fork(&block)
+  end
+
+  # Move our process to a new network namespace. Inside the namespace, all we have is a
+  # disabled localhost interface, so we have no network access.
+  def new_network_namespace!
+    return if network
+    Linux.unshare!([:clone_newnet])
+  end
+
+  # Move our process to a new Cgroup namespace.
+  def new_cgroup_namespace!
+    Linux.unshare!([:clone_newcgroup])
+  end
+
+  # Move our process to a new IPC namespace.
+  def new_ipc_namespace!
+    Linux.unshare!([:clone_newipc])
+  end
+
+  # Move our process to a new mount namespace. Inside the namespace, our process has its own
+  # unique view of of the filesystem.
+  def new_mount_namespace!
+    Linux.unshare!([:clone_newns])
+    mount!("tmpfs", "/tmp", fstype: "tmpfs")
+
+    ro.each do |path|
+      bind_mount!(path, File.join("/tmp", path), flags: %i[rdonly nodev nosuid])
+    end
+    rw.each do |path|
+      bind_mount!(path, File.join("/tmp", path), flags: %i[nodev nosuid])
+    end
+
+    if process
+      bind_mount!("/proc", "/tmp/proc")
+    else
+      mount!("proc", "/tmp/proc", fstype: "proc")
+    end
+
+    if tmp
+      mount!("tmpfs", "/tmp/tmp", fstype: "tmpfs")
+      mount!("tmpfs", "/tmp/run", fstype: "tmpfs")
+      mount!("tmpfs", "/tmp/dev/shm", fstype: "tmpfs")
+    end
+
+    remount!("/tmp", flags: %i[rdonly nodev nosuid])
+    pivot_root!("/tmp")
+  end
+
+  # @return [Array<Integer>] The list of currently open file descriptors for the process.
+  def open_fds
+    Dir.open("/proc/self/fd") do |dir|
+      # Don't include "/proc/self/fd" itself in the list of open files
+      dir.children.map(&:to_i) - [dir.fileno]
+    end
+  end
+
+  # Close all open files for the process, except stdin, stdout, and stderr.
+  #
+  # @param keep [Array<Integer>] The list of open files to keep.
+  # @return [void]
+  def close_fds!(keep: [0, 1, 2])
+    fds = open_fds - keep
+
+    fds.each do |fd|
+      IO.new(fd).close
+    rescue ArgumentError
+      # Trying to close FDs 3 and 4 will raise an ArgumentError because these
+      # FDs are used internally by the Ruby VM. Ignore the error.
+    end
+  end
+
+  # Redirect stdin, stdout, and stderr for our process.
+  def redirect_stdio!
+    IO.new(0).reopen(stdin)
+    IO.new(1).reopen(stdout)
+    IO.new(2).reopen(stderr)
+  end
+
+  def clear_env!
+    ENV.delete_if do |name, value|
+      !env.include?(name)
+    end
+  end
+
+  # Activate seccomp(2) filtering.
+  def filter_syscalls!
+    seccomp&.apply!
+  end
+
+  # Call prctl(PR_SET_NO_NEW_PRIVS, 1). This makes it so setuid binaries have
+  # no effect, so we can't elevate privileges by running things like sudo(1).
+  # This is required by seccomp(2).
+  #
+  # @see https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
+  def no_new_privs!
+    Linux.prctl!(:set_no_new_privs, 1, 0, 0, 0)
+  end
+
+  # Mount the `source` filesystem on the `target` directory.
+  def mount!(source, target, fstype: nil, flags: [])
+    FileUtils.mkdir_p(target, mode: 0755)
+    Linux.mount!(source, target, fstype, flags, nil)
+  end
+
+  # Remount an existing mountpoint with the new `flags`.
+  def remount!(target, flags: [])
+    mount!(nil, target, flags: flags + [:remount])
+  end
+
+  # Bind mount a directory to a new mountpoint. Bind mounting `/usr` to
+  # `/tmp/usr` means `/tmp/usr` refers to the same directory as `/usr`.
+  def bind_mount!(source, target, flags: [])
+    mount!(source, target, flags: [:bind, :rec, :private, *flags])
+  end
+
+  # Change the root (`/`) directory to the given directory.
+  def pivot_root!(newroot)
+    Linux.pivot_root!(newroot, newroot)
+    Dir.chdir("/")
+
+    # The new root was mounted on top of the old root. This unmounts the old root.
+    Linux.umount2!(".", :detach)
+  end
+
+  # Change the system hostname.
+  def sethostname!(hostname)
+    Linux.sethostname!(hostname, hostname.size)
+  end
+
+  # Create Ruby bindings for various Linux kernel syscalls.
+  # https://github.com/ffi/ffi/wiki
+  module Linux
+    extend FFI::Library
+    ffi_lib FFI::Library::LIBC
+
+    # Create a Ruby method that calls the given system call, and define a bang version that
+    # raises an error if the syscall fails.
+    def self.attach_function(name, *args)
+      define_singleton_method("#{name}!") do |*args|
+        ret = send(name, *args)
+
+        if ret < 0
+          message = "#{name}(#{args.map(&:inspect).join(", ")})"
+          raise SystemCallError.new(message, FFI.errno)
+        else
+          ret
+        end
+      end
+
+      super(name, *args)
+    end
+
+    # https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
+    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
+    enum :prctl_command, [
+      :set_no_new_privs, 38
+    ]
+
+    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/sched.h
+    bitmask :unshare_flags, [
+      :clone_time,      7,  # 0x00000080, New time namespace
+      :clone_newns,	    17, # 0x00020000,	New mount (filesystem) namespace
+      :clone_newcgroup, 25, # 0x02000000, New cgroup namespace
+      :clone_newuts,	  26, # 0x04000000,	New utsname (hostname) namespace
+      :clone_newipc,	  27, # 0x08000000,	New ipc namespace
+      :clone_newuser,	  28, # 0x10000000, New user namespace
+      :clone_newpid,	  29, # 0x20000000,	New pid namespace
+      :clone_newnet,	  30, # 0x40000000,	New network namespace
+    ]
+
+    # https://github.com/torvalds/linux/blob/master/include/uapi/linux/mount.h
+    bitmask :mount_flags, [
+      :rdonly,  0,
+      :nosuid,  1,
+      :nodev,   2,
+      :remount, 5,
+      :bind,    12,
+      :rec,     14,
+      :private, 18,
+      :slave,   19,
+    ]
+
+    # https://github.com/torvalds/linux/blob/master/include/linux/fs.h#L1425
+    bitmask :umount_flags, [
+      :detach, 1 # 0x2
+    ]
+
+    # prctl - operations on a process or thread
+    # https://man7.org/linux/man-pages/man2/prctl.2.html
+    # int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);
+    attach_function :prctl, [:prctl_command, :long, :long, :long, :long], :int
+
+    # unshare - disassociate parts of the process execution context
+    # https://man7.org/linux/man-pages/man2/unshare.2.html
+    # https://man7.org/linux/man-pages/man7/namespaces.7.html
+    # int unshare(int flags);
+    attach_function :unshare, [:unshare_flags], :int
+
+    # mount - mount filesystem
+    # https://man7.org/linux/man-pages/man2/mount.2.html
+    # int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
+    attach_function :mount, [:string, :string, :string, :mount_flags, :pointer], :int
+
+    # umount, umount2 - unmount filesystem
+    # https://man7.org/linux/man-pages/man2/umount2.2.html
+    # int umount2(const char *target, int flags);
+    attach_function :umount2, [:string, :umount_flags], :int
+
+    # pivot_root - change the root mount
+    # https://man7.org/linux/man-pages/man2/pivot_root.2.html
+    # int pivot_root(const char *new_root, const char *put_old);
+    attach_function :pivot_root, [:string, :string], :int
+
+    # sethostname - set system hostname
+    # https://man7.org/linux/man-pages/man2/sethostname.2.html
+    # int sethostname(const char *name, size_t len);
+    attach_function :sethostname, [:string, :size_t], :int
+  end
+end
diff --git a/config/environments/development.rb b/config/environments/development.rb
index 226eaf286..38e1ddd63 100644
--- a/config/environments/development.rb
+++ b/config/environments/development.rb
@@ -59,7 +59,7 @@ Rails.application.configure do
 
   # Use an evented file watcher to asynchronously detect changes in source code,
   # routes, locales, etc. This feature depends on the listen gem.
-  config.file_watcher = ActiveSupport::EventedFileUpdateChecker
+  config.file_watcher = ActiveSupport::EventedFileUpdateChecker unless Danbooru.config.debug_mode
 
   # Uncomment if you wish to allow Action Cable access from any origin.
   # config.action_cable.disable_request_forgery_protection = true