diff --git a/Gemfile b/Gemfile index c8c1e3a9a..aa357dadb 100644 --- a/Gemfile +++ b/Gemfile @@ -55,6 +55,7 @@ gem "rack-timeout", require: "rack/timeout/base" gem "parallel" gem "pry-byebug" gem "pry-rails" +gem "ffi" group :development do gem 'rubocop', require: false diff --git a/Gemfile.lock b/Gemfile.lock index 325742ef6..6c901de51 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -508,6 +508,7 @@ DEPENDENCIES ed25519 factory_bot ffaker + ffi flamegraph google-cloud-bigquery google-cloud-storage diff --git a/app/logical/seccomp.rb b/app/logical/seccomp.rb new file mode 100644 index 000000000..a141cf1d2 --- /dev/null +++ b/app/logical/seccomp.rb @@ -0,0 +1,533 @@ +# This is a wrapper around seccomp, a Linux kernel feature used to limit the +# system calls the current process is allowed to make. This is used for +# sandboxing code when processing user-uploaded files. +# +# @example +# # Allow only the read(2), write(2), close(2), and exit_group(2) syscalls to be used +# # for the remainder of the program; kill the process if any other syscalls are called. +# Seccomp.allow!("read write close exit_group") +# +# # Kill the process if sync(2) is called; allow all other syscalls. +# Seccomp.deny!("sync") +# +# # Run exiftool in a sandboxed subprocess, allowing it to only use syscalls +# # from the @exec, @signals, and @tty syscall groups. +# Seccomp.allow!("@exec @signals @tty") do +# exec "exiftool -json image.jpg" +# end +# +# # Run a shell inside a seccomp sandbox. +# Seccomp.allow!("@common") { exec "dash" } +# +# # Print a human-readable dump of the seccomp filter. +# puts Seccomp.allow("@exec @signals @tty").to_pfc +# +# # Show all available syscalls. +# puts Seccomp.syscalls +# +# Documentation: +# +# @see https://en.wikipedia.org/wiki/Seccomp +# @see https://lwn.net/Articles/656307/ A seccomp overview +# @see https://lwn.net/Articles/494252/ A library for seccomp filters +# @see https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html +# @see https://man7.org/linux/man-pages/man2/seccomp.2.html +# @see https://github.com/seccomp/libseccomp +# @see https://blog.cloudflare.com/sandboxing-in-linux-with-zero-lines-of-code/ +# @see https://docs.docker.com/engine/security/seccomp/ +# @see https://kubernetes.io/docs/tutorials/clusters/seccomp/ +# @see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#System%20Call%20Filtering +# +# Related projects: +# +# @see https://github.com/cloudflare/sandbox +# @see https://github.com/david942j/seccomp-tools +# @see https://man.openbsd.org/pledge.2 +# @see https://dev.exherbo.org/~alip/sydbox/ +# +# Syscall lists: +# +# @see https://github.com/seccomp/libseccomp/blob/main/src/syscalls.csv +# @see https://github.com/systemd/systemd/blob/main/src/shared/seccomp-util.c#L281 +# @see https://github.com/torvalds/linux/blob/master/arch/x86/entry/syscalls/syscall_64.tbl +# @see https://marcin.juszkiewicz.com.pl/download/tables/syscalls.html +# @see https://filippo.io/linux-syscall-table/ +module Seccomp + class Error < StandardError; end + + # Symbolic groups of syscalls that can be used in filters. + SYSCALL_GROUPS = { + # A broad set of common syscalls sufficient to run most programs. + "@common" => %w[ + @exec @exit @fs @memory @network @process-control @process-info @signals + @stdio @system-info @threads @time @tty + ], + # Syscalls needed to cleanly exit a Ruby program. + "@exit" => %w[ + exit exit_group getpid munmap rt_sigaction timer_delete + ], + # Syscalls needed to allocate and manage memory. + "@memory" => %w[ + brk mmap mmap2 munmap mprotect mremap + ], + # Syscalls needed by multi-threaded Ruby programs. + "@threads" => %w[ + futex getpid mmap ppoll read write sched_yield + ], + # Syscalls commonly needed to execute external programs. + "@exec" => %w[ + @memory + @stdio + @fs-read + @process-info + @exit + arch_prctl + execve execveat + futex + set_robust_list + set_tid_address + prlimit64 + timer_settime + ], + # Syscalls for reading and writing open files. + "@stdio" => %w[ + close + dup dup2 dup3 + getdents getdents64 + fadvise64 + fcntl + fgetxattr + fstat + lseek + pipe pipe2 + read pread64 readv preadv preadv2 + write pwrite64 writev pwritev pwritev2 + ], + # Syscalls for opening files. + "@fs-open" => %w[ + open openat openat2 + ], + # Syscalls that read information from the filesystem. + "@fs-read" => %w[ + @fs-open + access faccessat faccessat2 + chdir fchdir + getcwd + getxattr lgetxattr fgetxattr + readlink readlinkat + stat fstat newfstatat lstat + statfs fstatfs + ], + # Syscalls that modify data on the filesystem. + "@fs-write" => %w[ + @fs-read + creat + fallocate + link linkat + mkdir mkdirat + rename renameat renameat2 + rmdir + symlink symlinkat + truncate ftruncate + umask + unlink unlinkat + ], + # Syscalls that modify metadata on the filesystem. + "@fs-attr" => %w[ + @fs-write + chmod fchmod fchmodat + chown fchown fchownat lchown + setxattr lsetxattr fsetxattr + utime utimes utimensat futimesat + ], + # Syscalls for reading or writing to the filesystem. + "@fs" => %w[ + @stdio @fs-attr + ], + "@evented-io" => %w[ + epoll_create epoll_create1 epoll_ctl epoll_wait epoll_pwait + eventfd eventfd2 + poll ppoll + select pselect6 + ], + "@network" => %w[ + socket socketpair + accept accept4 + bind + connect + listen + shutdown + + recv recvfrom recvmsg recvmmsg recvmmsg_time64 + send sendto sendmsg sendmmsg + getpeername + getsockname + getsockopt setsockopt + ], + "@process-info" => %w[ + capget + getpid getppid + getpgid getpgrp + getsid gettid + getuid geteuid getresuid + getgid getegid getresgid getgroups + sched_getaffinity + times + ], + "@process-control" => %w[ + clone clone3 fork vfork + getpriority setpriority + kill tkill tgkill rt_sigqueueinfo rt_tgsigqueueinfo + nice + pidfd_open pidfd_send_signal + prlimit64 + setpgid + wait4 waitid waitpid + ], + "@signals" => %w[ + alarm + rt_sigaction sigaction + rt_sigpending sigpending + rt_sigprocmask sigprocmask + rt_sigsuspend sigsuspend + rt_sigtimedwait rt_sigtimedwait_time64 + rt_sigreturn + signalfd signalfd4 + sigaltstack + signal + pause + ], + "@system-info" => %w[ + sysinfo + uname + ], + "@time" => %w[ + nanosleep clock_nanosleep + clock_getres + clock_gettime + gettimeofday + time + ], + "@tty" => %w[ + ioctl + ], + } + + # A lowlevel wrapper around libseccomp using the Ruby FFI. + # + # https://github.com/ffi/ffi/wiki + # https://github.com/seccomp/libseccomp + module LibSeccomp + extend FFI::Library + ffi_lib "libseccomp" + + # https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L121 + enum :arch, [:native, 0] + + # https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L332 + enum FFI::Type::UINT32, :action, [ + :kill, 0x80000000, + :kill_process, 0x80000000, + :kill_thread, 0x00000000, + :log, 0x7ffc0000, + :allow, 0x7fff0000, + ] + + # https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L64 + enum :attr, [ + :tsync, 4, + :optimize, 8, + ] + + typedef :pointer, :scmp_filter_ctx + + # seccomp_init - Initialize the seccomp filter state + # https://man7.org/linux/man-pages/man3/seccomp_init.3.html + # scmp_filter_ctx seccomp_init(uint32_t def_action); + attach_function :seccomp_init, [:action], :scmp_filter_ctx + + # seccomp_load - Load the current seccomp filter into the kernel + # https://man7.org/linux/man-pages/man3/seccomp_load.3.html + # int seccomp_load(scmp_filter_ctx ctx); + attach_function :seccomp_load, [:scmp_filter_ctx], :int + + # seccomp_release - Release the seccomp filter state + # https://man7.org/linux/man-pages/man3/seccomp_release.3.html + # void seccomp_release(scmp_filter_ctx ctx); + attach_function :seccomp_release, [:scmp_filter_ctx], :void + + # seccomp_rule_add - Add a seccomp filter rule + # https://man7.org/linux/man-pages/man3/seccomp_rule_add.3.html + # int seccomp_rule_add(scmp_filter_ctx ctx, uint32_t action, int syscall, unsigned int arg_cnt, ...); + attach_function :seccomp_rule_add, [:scmp_filter_ctx, :action, :int, :uint32], :int + + # seccomp_syscall_resolve_name - Resolve a syscall name to a number + # https://man7.org/linux/man-pages/man3/seccomp_syscall_resolve_name.3.html + # int seccomp_syscall_resolve_name(const char *name); + attach_function :seccomp_syscall_resolve_name, [:string], :int + + # seccomp_syscall_resolve_name - Resolve a syscall number to a name + # https://man7.org/linux/man-pages/man3/seccomp_syscall_resolve_num_arch.3.html + # char* seccomp_syscall_resolve_num_arch(uint32_t arch_token, int num) + attach_function :seccomp_syscall_resolve_num_arch, [:arch, :int], :strptr + + # seccomp_attr_set - Manage the seccomp filter attributes + # https://man7.org/linux/man-pages/man3/seccomp_attr_set.3.html + # int seccomp_attr_set(scmp_filter_ctx ctx, enum scmp_filter_attr attr, uint32_t value); + attach_function :seccomp_attr_set, [:scmp_filter_ctx, :attr, :uint32], :int + + # seccomp_export_bpf - Export the seccomp filter as BPF + # int seccomp_export_bpf(const scmp_filter_ctx ctx, int fd); + # https://man7.org/linux/man-pages/man3/seccomp_export_bpf.3.html + attach_function :seccomp_export_bpf, [:scmp_filter_ctx, :int], :int + + # seccomp_export_pfc - Export the seccomp filter as PFC + # int seccomp_export_pfc(const scmp_filter_ctx ctx, int fd); + # https://man7.org/linux/man-pages/man3/seccomp_export_pfc.3.html + attach_function :seccomp_export_pfc, [:scmp_filter_ctx, :int], :int + end + + module LibC + extend FFI::Library + ffi_lib FFI::Library::LIBC + + attach_function :free, [:pointer], :void + end + + # A Seccomp::Filter represents a single seccomp filter, containing a set of + # syscall filtering rules and a default action. + class Filter + attr_reader :context, :tsync, :optimize + + # Create a new syscall filter. Use `add_rule` to add rules to the filter. + # Use `apply!` to activate the filter after all rules have been added. + # + # If a block is given, run the block with the new filter. + # + # @param default_action [Symbol] The default action to take when a syscall + # doesn't match a rule. + # @param tsync [Boolean] True to apply the filter to all threads in the + # current process. False to apply it just to the current thread. + # @param optimize [Boolean] True to generate the filter as a binary tree, + # false as a sequential list. + def initialize(default_action = :kill, tsync: true, optimize: false) + @context = init!(default_action) + self.tsync = tsync + self.optimize = optimize + + yield self if block_given? + end + + # Add a syscall rule to the filter. If the syscall doesn't exist, raise an error. + # + # @param syscall_name [String] The name of the syscall + # @param action [Symbol] The action to take when the syscall is called (:allow, :log, :kill) + # @return [self] + # @raise [SystemCallError] If the rule couldn't be added + def add_rule(syscall_name, action) + syscall_number = Seccomp.resolve_syscall_name(syscall_name) + ret = LibSeccomp.seccomp_rule_add(context, action, syscall_number, 0) + raise SystemCallError.new("seccomp_rule_add(#{action}, #{syscall_name}) failed", -ret) if ret < 0 + + self + end + + # Activate the syscall filter by loading it into the kernel. All code after + # this point must obey the syscall filter. + # + # If a block is given, apply the filter to the given block of code. The + # block is run in a forked subprocess, which means the filter only applies + # to the block of code. + # + # @return [self] + def apply!(&block) + return apply_to!(&block) if block_given? + + ret = LibSeccomp.seccomp_load(context) + raise SystemCallError.new("seccomp_load(#{context}) failed", -ret) if ret < 0 + self + end + + # Apply the filter to a block of code in a forked subprocess. + def apply_to!(&block) + raise ArgumentError, "Seccomp::Filter#apply_block!: block required" unless block_given? + + pid = Process.fork do + apply! + yield self + end + + pid, status = Process.wait2(pid) + if status.signaled? && Signal.signame(status.termsig) == "SYS" + raise Error, "Subprocess called unauthorized syscall (see dmesg for details)" + end + + self + end + + # Return a string representing the filter in BPF (Berkeley Packet Filter) format. + # + # @return [String] + def to_bpf + IO.pipe do |reader, writer| + ret = LibSeccomp.seccomp_export_bpf(context, writer.fileno) + raise SystemCallError.new("seccomp_export_bpf() failed", -ret) if ret < 0 + writer.close + reader.read + end + end + + # Return a string representing the filter in PFC (Pseudo Filter Code) format. + # + # @return [String] + def to_pfc + IO.pipe do |reader, writer| + ret = LibSeccomp.seccomp_export_pfc(context, writer.fileno) + raise SystemCallError.new("seccomp_export_pfc() failed", -ret) if ret < 0 + writer.close + reader.read + end + end + + protected + + # Create a new libseccomp context. + # + # @param default_action [Symbol] The default_action for the context + # @return [FFI::AutoPointer] The libseccomp context + def init!(default_action) + context = LibSeccomp.seccomp_init(default_action) + raise Errno::ENOMEM, "seccomp_init(#{default_action}) failed" if context == nil + + FFI::AutoPointer.new(context, LibSeccomp.method(:seccomp_release)) + end + + # Set an attribute on the filter. + # + # @see https://man7.org/linux/man-pages/man3/seccomp_attr_set.3.html + # @param [Symbol] the attribute name + # @param [Integer] the attribute value + def set_attr(attr, value) + ret = LibSeccomp.seccomp_attr_set(context, attr, value) + raise SystemCallError.new("seccomp_attr_set(context, #{attr.inspect}, #{value}) failed", -ret) if ret < 0 + end + + # If true, apply the filter to all threads in the process. If false, + # apply it only to the current thread. + # + # @param value [Boolean] + # @return [void] + def tsync=(value) + set_attr(:tsync, value ? 1 : 0) + @tsync = value + end + + # If true, generate the BPF code as a binary tree of if-else statements. + # May be faster for large rule sets. If false, generate the BPF code as a + # sequential list of if-else statements. + # + # @param value [Boolean] + # @return [void] + def optimize=(value) + set_attr(:optimize, value ? 2 : 1) + @optimize = value + end + end + + # Create a filter allowing only the given set of syscalls. Deny all other + # syscalls by default. Calling a denied syscall will kill the process by default. + def self.allow(syscalls, default_action: :kill) + filter(syscalls, :allow, default_action: default_action) + end + + # Create a filter denying the given set of syscalls. Allow all other + # syscalls by default. Calling a denied syscall will kill the process by default. + def self.deny(syscalls, default_action: :allow) + filter(syscalls, :kill, default_action: default_action) + end + + # Create and immediately apply a filter allowing only the given set of syscalls. + def self.allow!(syscalls, default_action: :kill, &block) + allow(syscalls, default_action: default_action).apply!(&block) + end + + # Create and immediately apply a filter denying the given set of syscalls. + def self.deny!(syscalls, default_action: :allow, &block) + deny(syscalls, default_action: default_action).apply!(&block) + end + + # Create a syscall filter for the current process that performs `action` when + # any of the given syscalls are called, or `default_action` when any other + # syscall is called. + # + # Call `apply!` on the result to activate the filter. + # + # @param syscalls [Array] The set of syscalls + # @param action [Symbol] The action to take when any of the given syscalls are called (:allow, :log, :kill) + # @param default_action [Symbol] The action to take when any other syscall is called (:allow, :log, :kill) + # @param options [Hash] Options to pass to Seccomp::Filter#initialize + # @return [Seccomp::Filter] the seccomp filter + def self.filter(syscalls, action, default_action: :kill, **options) + Filter.new(default_action, **options) do |filter| + expand_syscall_names(syscalls).each do |syscall| + filter.add_rule(syscall, action) + end + end + end + + # Return the list of syscalls available on the current system. This list may + # vary depending on the CPU architecture and kernel version. + # + # @return [Hash] a hash of syscall numbers to syscall names + def self.syscalls + @syscalls ||= 0.upto(8192).map do |n| + [n, resolve_syscall_number(n) ] + end.to_h.compact + end + + # Recursively expand a list of syscall names, that may contain a mixture of regular + # names and syscall group names (e.g. `@stdio`), to a flat list of syscall names. + # + # @param syscall_names [Array] A list of syscall names. May include syscall + # groups (e.g `@stdio`). May be a space-separated string, or a list of strings. + # @return [Array] A list of syscall names + def self.expand_syscall_names(*syscall_names) + syscall_names.flatten.flat_map(&:split).flat_map do |syscall| + if syscall.start_with?("@") + group = SYSCALL_GROUPS.fetch(syscall) + expand_syscall_names(group) + else + syscall + end + end.sort.uniq + end + + # Resolve a syscall name to a syscall number. + # + # May return a negative number if the syscall exists on another architecture, + # but not on this architecture. For example, `arch_prctl` exists on x86 but + # not on ARM or other architectures. + # + # Raises an error if the syscall doesn't exist on any architecture. + # + # @param [String] the syscall name + # @return [Integer] the syscall number + # @raise [Errno::EINVAL] if the syscall doesn't exist + def self.resolve_syscall_name(syscall_name) + syscall_number = LibSeccomp.seccomp_syscall_resolve_name(syscall_name.to_s) + raise Errno::EINVAL, "Syscall '#{syscall_name}' doesn't exist" if syscall_number == -1 + syscall_number + end + + # Resolve a syscall number to a syscall name. + # + # @param syscall_number [Integer] The syscall number + # @param arch [Symbol] The CPU architecture (x86_64, aarch64, etc) + # @return [String, nil] The syscall name, or nil if a syscall by that number doesn't exist + def self.resolve_syscall_number(syscall_number, arch = :native) + name, ptr = LibSeccomp.seccomp_syscall_resolve_num_arch(arch, syscall_number) + name + ensure + LibC.free(ptr) + end +end diff --git a/config/docker/build-base-image.sh b/config/docker/build-base-image.sh index 9e3a8961e..e97578a19 100755 --- a/config/docker/build-base-image.sh +++ b/config/docker/build-base-image.sh @@ -23,6 +23,7 @@ DANBOORU_RUNTIME_DEPS=" ca-certificates mkvtoolnix rclone libpq5 zlib1g libfftw3-3 libwebp6 libwebpmux3 libwebpdemux2 liborc-0.4.0 liblcms2-2 libpng16-16 libjpeg-turbo8 libexpat1 libglib2.0 libgif7 libexif12 libvpx6 + libseccomp2 " COMMON_RUNTIME_DEPS=" $DANBOORU_RUNTIME_DEPS $EXIFTOOL_RUNTIME_DEPS tini busybox less ncdu