536 lines
18 KiB
Ruby
536 lines
18 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# This is a wrapper around seccomp, a Linux kernel feature used to limit the
|
|
# system calls the current process is allowed to make. This is used for
|
|
# sandboxing code when processing user-uploaded files.
|
|
#
|
|
# @example
|
|
# # Allow only the read(2), write(2), close(2), and exit_group(2) syscalls to be used
|
|
# # for the remainder of the program; kill the process if any other syscalls are called.
|
|
# Seccomp.allow!("read write close exit_group")
|
|
#
|
|
# # Kill the process if sync(2) is called; allow all other syscalls.
|
|
# Seccomp.deny!("sync")
|
|
#
|
|
# # Run exiftool in a sandboxed subprocess, allowing it to only use syscalls
|
|
# # from the @exec, @signals, and @tty syscall groups.
|
|
# Seccomp.allow!("@exec @signals @tty") do
|
|
# exec "exiftool -json image.jpg"
|
|
# end
|
|
#
|
|
# # Run a shell inside a seccomp sandbox.
|
|
# Seccomp.allow!("@common") { exec "dash" }
|
|
#
|
|
# # Print a human-readable dump of the seccomp filter.
|
|
# puts Seccomp.allow("@exec @signals @tty").to_pfc
|
|
#
|
|
# # Show all available syscalls.
|
|
# puts Seccomp.syscalls
|
|
#
|
|
# Documentation:
|
|
#
|
|
# @see https://en.wikipedia.org/wiki/Seccomp
|
|
# @see https://lwn.net/Articles/656307/ A seccomp overview
|
|
# @see https://lwn.net/Articles/494252/ A library for seccomp filters
|
|
# @see https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html
|
|
# @see https://man7.org/linux/man-pages/man2/seccomp.2.html
|
|
# @see https://github.com/seccomp/libseccomp
|
|
# @see https://blog.cloudflare.com/sandboxing-in-linux-with-zero-lines-of-code/
|
|
# @see https://docs.docker.com/engine/security/seccomp/
|
|
# @see https://kubernetes.io/docs/tutorials/clusters/seccomp/
|
|
# @see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#System%20Call%20Filtering
|
|
#
|
|
# Related projects:
|
|
#
|
|
# @see https://github.com/cloudflare/sandbox
|
|
# @see https://github.com/david942j/seccomp-tools
|
|
# @see https://man.openbsd.org/pledge.2
|
|
# @see https://dev.exherbo.org/~alip/sydbox/
|
|
#
|
|
# Syscall lists:
|
|
#
|
|
# @see https://github.com/seccomp/libseccomp/blob/main/src/syscalls.csv
|
|
# @see https://github.com/systemd/systemd/blob/main/src/shared/seccomp-util.c#L281
|
|
# @see https://github.com/torvalds/linux/blob/master/arch/x86/entry/syscalls/syscall_64.tbl
|
|
# @see https://marcin.juszkiewicz.com.pl/download/tables/syscalls.html
|
|
# @see https://filippo.io/linux-syscall-table/
|
|
module Seccomp
|
|
class Error < StandardError; end
|
|
|
|
# Symbolic groups of syscalls that can be used in filters.
|
|
SYSCALL_GROUPS = {
|
|
# A broad set of common syscalls sufficient to run most programs.
|
|
"@common" => %w[
|
|
@exec @exit @fs @memory @network @process-control @process-info @signals
|
|
@stdio @system-info @threads @time @tty
|
|
],
|
|
# Syscalls needed to cleanly exit a Ruby program.
|
|
"@exit" => %w[
|
|
exit exit_group getpid munmap rt_sigaction timer_delete
|
|
],
|
|
# Syscalls needed to allocate and manage memory.
|
|
"@memory" => %w[
|
|
brk mmap mmap2 munmap mprotect mremap
|
|
],
|
|
# Syscalls needed by multi-threaded Ruby programs.
|
|
"@threads" => %w[
|
|
futex getpid mmap ppoll read write sched_yield
|
|
],
|
|
# Syscalls commonly needed to execute external programs.
|
|
"@exec" => %w[
|
|
@memory
|
|
@stdio
|
|
@fs-read
|
|
@process-info
|
|
@exit
|
|
arch_prctl
|
|
execve execveat
|
|
futex
|
|
set_robust_list
|
|
set_tid_address
|
|
prlimit64
|
|
timer_settime
|
|
],
|
|
# Syscalls for reading and writing open files.
|
|
"@stdio" => %w[
|
|
close
|
|
dup dup2 dup3
|
|
getdents getdents64
|
|
fadvise64
|
|
fcntl
|
|
fgetxattr
|
|
fstat
|
|
lseek
|
|
pipe pipe2
|
|
read pread64 readv preadv preadv2
|
|
write pwrite64 writev pwritev pwritev2
|
|
],
|
|
# Syscalls for opening files.
|
|
"@fs-open" => %w[
|
|
open openat openat2
|
|
],
|
|
# Syscalls that read information from the filesystem.
|
|
"@fs-read" => %w[
|
|
@fs-open
|
|
access faccessat faccessat2
|
|
chdir fchdir
|
|
getcwd
|
|
getxattr lgetxattr fgetxattr
|
|
readlink readlinkat
|
|
stat fstat newfstatat lstat
|
|
statfs fstatfs
|
|
],
|
|
# Syscalls that modify data on the filesystem.
|
|
"@fs-write" => %w[
|
|
@fs-read
|
|
creat
|
|
fallocate
|
|
link linkat
|
|
mkdir mkdirat
|
|
rename renameat renameat2
|
|
rmdir
|
|
symlink symlinkat
|
|
truncate ftruncate
|
|
umask
|
|
unlink unlinkat
|
|
],
|
|
# Syscalls that modify metadata on the filesystem.
|
|
"@fs-attr" => %w[
|
|
@fs-write
|
|
chmod fchmod fchmodat
|
|
chown fchown fchownat lchown
|
|
setxattr lsetxattr fsetxattr
|
|
utime utimes utimensat futimesat
|
|
],
|
|
# Syscalls for reading or writing to the filesystem.
|
|
"@fs" => %w[
|
|
@stdio @fs-attr
|
|
],
|
|
"@evented-io" => %w[
|
|
epoll_create epoll_create1 epoll_ctl epoll_wait epoll_pwait
|
|
eventfd eventfd2
|
|
poll ppoll
|
|
select pselect6
|
|
],
|
|
"@network" => %w[
|
|
socket socketpair
|
|
accept accept4
|
|
bind
|
|
connect
|
|
listen
|
|
shutdown
|
|
|
|
recv recvfrom recvmsg recvmmsg recvmmsg_time64
|
|
send sendto sendmsg sendmmsg
|
|
getpeername
|
|
getsockname
|
|
getsockopt setsockopt
|
|
],
|
|
"@process-info" => %w[
|
|
capget
|
|
getpid getppid
|
|
getpgid getpgrp
|
|
getsid gettid
|
|
getuid geteuid getresuid
|
|
getgid getegid getresgid getgroups
|
|
sched_getaffinity
|
|
times
|
|
],
|
|
"@process-control" => %w[
|
|
clone clone3 fork vfork
|
|
getpriority setpriority
|
|
kill tkill tgkill rt_sigqueueinfo rt_tgsigqueueinfo
|
|
nice
|
|
pidfd_open pidfd_send_signal
|
|
prlimit64
|
|
setpgid
|
|
wait4 waitid waitpid
|
|
],
|
|
"@signals" => %w[
|
|
alarm
|
|
rt_sigaction sigaction
|
|
rt_sigpending sigpending
|
|
rt_sigprocmask sigprocmask
|
|
rt_sigsuspend sigsuspend
|
|
rt_sigtimedwait rt_sigtimedwait_time64
|
|
rt_sigreturn
|
|
signalfd signalfd4
|
|
sigaltstack
|
|
signal
|
|
pause
|
|
],
|
|
"@system-info" => %w[
|
|
sysinfo
|
|
uname
|
|
],
|
|
"@time" => %w[
|
|
nanosleep clock_nanosleep
|
|
clock_getres
|
|
clock_gettime
|
|
gettimeofday
|
|
time
|
|
],
|
|
"@tty" => %w[
|
|
ioctl
|
|
],
|
|
}
|
|
|
|
# A lowlevel wrapper around libseccomp using the Ruby FFI.
|
|
#
|
|
# https://github.com/ffi/ffi/wiki
|
|
# https://github.com/seccomp/libseccomp
|
|
module LibSeccomp
|
|
extend FFI::Library
|
|
ffi_lib "libseccomp"
|
|
|
|
# https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L121
|
|
enum :arch, [:native, 0]
|
|
|
|
# https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L332
|
|
enum FFI::Type::UINT32, :action, [
|
|
:kill, 0x80000000,
|
|
:kill_process, 0x80000000,
|
|
:kill_thread, 0x00000000,
|
|
:log, 0x7ffc0000,
|
|
:allow, 0x7fff0000,
|
|
]
|
|
|
|
# https://github.com/seccomp/libseccomp/blob/main/include/seccomp.h.in#L64
|
|
enum :attr, [
|
|
:tsync, 4,
|
|
:optimize, 8,
|
|
]
|
|
|
|
typedef :pointer, :scmp_filter_ctx
|
|
|
|
# seccomp_init - Initialize the seccomp filter state
|
|
# https://man7.org/linux/man-pages/man3/seccomp_init.3.html
|
|
# scmp_filter_ctx seccomp_init(uint32_t def_action);
|
|
attach_function :seccomp_init, [:action], :scmp_filter_ctx
|
|
|
|
# seccomp_load - Load the current seccomp filter into the kernel
|
|
# https://man7.org/linux/man-pages/man3/seccomp_load.3.html
|
|
# int seccomp_load(scmp_filter_ctx ctx);
|
|
attach_function :seccomp_load, [:scmp_filter_ctx], :int
|
|
|
|
# seccomp_release - Release the seccomp filter state
|
|
# https://man7.org/linux/man-pages/man3/seccomp_release.3.html
|
|
# void seccomp_release(scmp_filter_ctx ctx);
|
|
attach_function :seccomp_release, [:scmp_filter_ctx], :void
|
|
|
|
# seccomp_rule_add - Add a seccomp filter rule
|
|
# https://man7.org/linux/man-pages/man3/seccomp_rule_add.3.html
|
|
# int seccomp_rule_add(scmp_filter_ctx ctx, uint32_t action, int syscall, unsigned int arg_cnt, ...);
|
|
attach_function :seccomp_rule_add, [:scmp_filter_ctx, :action, :int, :uint32], :int
|
|
|
|
# seccomp_syscall_resolve_name - Resolve a syscall name to a number
|
|
# https://man7.org/linux/man-pages/man3/seccomp_syscall_resolve_name.3.html
|
|
# int seccomp_syscall_resolve_name(const char *name);
|
|
attach_function :seccomp_syscall_resolve_name, [:string], :int
|
|
|
|
# seccomp_syscall_resolve_name - Resolve a syscall number to a name
|
|
# https://man7.org/linux/man-pages/man3/seccomp_syscall_resolve_num_arch.3.html
|
|
# char* seccomp_syscall_resolve_num_arch(uint32_t arch_token, int num)
|
|
attach_function :seccomp_syscall_resolve_num_arch, [:arch, :int], :strptr
|
|
|
|
# seccomp_attr_set - Manage the seccomp filter attributes
|
|
# https://man7.org/linux/man-pages/man3/seccomp_attr_set.3.html
|
|
# int seccomp_attr_set(scmp_filter_ctx ctx, enum scmp_filter_attr attr, uint32_t value);
|
|
attach_function :seccomp_attr_set, [:scmp_filter_ctx, :attr, :uint32], :int
|
|
|
|
# seccomp_export_bpf - Export the seccomp filter as BPF
|
|
# int seccomp_export_bpf(const scmp_filter_ctx ctx, int fd);
|
|
# https://man7.org/linux/man-pages/man3/seccomp_export_bpf.3.html
|
|
attach_function :seccomp_export_bpf, [:scmp_filter_ctx, :int], :int
|
|
|
|
# seccomp_export_pfc - Export the seccomp filter as PFC
|
|
# int seccomp_export_pfc(const scmp_filter_ctx ctx, int fd);
|
|
# https://man7.org/linux/man-pages/man3/seccomp_export_pfc.3.html
|
|
attach_function :seccomp_export_pfc, [:scmp_filter_ctx, :int], :int
|
|
end
|
|
|
|
module LibC
|
|
extend FFI::Library
|
|
ffi_lib FFI::Library::LIBC
|
|
|
|
attach_function :free, [:pointer], :void
|
|
end
|
|
|
|
# A Seccomp::Filter represents a single seccomp filter, containing a set of
|
|
# syscall filtering rules and a default action.
|
|
class Filter
|
|
attr_reader :context, :tsync, :optimize
|
|
|
|
# Create a new syscall filter. Use `add_rule` to add rules to the filter.
|
|
# Use `apply!` to activate the filter after all rules have been added.
|
|
#
|
|
# If a block is given, run the block with the new filter.
|
|
#
|
|
# @param default_action [Symbol] The default action to take when a syscall
|
|
# doesn't match a rule.
|
|
# @param tsync [Boolean] True to apply the filter to all threads in the
|
|
# current process. False to apply it just to the current thread.
|
|
# @param optimize [Boolean] True to generate the filter as a binary tree,
|
|
# false as a sequential list.
|
|
def initialize(default_action = :kill, tsync: true, optimize: false)
|
|
@context = init!(default_action)
|
|
self.tsync = tsync
|
|
self.optimize = optimize
|
|
|
|
yield self if block_given?
|
|
end
|
|
|
|
# Add a syscall rule to the filter. If the syscall doesn't exist, raise an error.
|
|
#
|
|
# @param syscall_name [String] The name of the syscall
|
|
# @param action [Symbol] The action to take when the syscall is called (:allow, :log, :kill)
|
|
# @return [self]
|
|
# @raise [SystemCallError] If the rule couldn't be added
|
|
def add_rule(syscall_name, action)
|
|
syscall_number = Seccomp.resolve_syscall_name(syscall_name)
|
|
ret = LibSeccomp.seccomp_rule_add(context, action, syscall_number, 0)
|
|
raise SystemCallError.new("seccomp_rule_add(#{action}, #{syscall_name}) failed", -ret) if ret < 0
|
|
|
|
self
|
|
end
|
|
|
|
# Activate the syscall filter by loading it into the kernel. All code after
|
|
# this point must obey the syscall filter.
|
|
#
|
|
# If a block is given, apply the filter to the given block of code. The
|
|
# block is run in a forked subprocess, which means the filter only applies
|
|
# to the block of code.
|
|
#
|
|
# @return [self]
|
|
def apply!(&block)
|
|
return apply_to!(&block) if block_given?
|
|
|
|
ret = LibSeccomp.seccomp_load(context)
|
|
raise SystemCallError.new("seccomp_load(#{context}) failed", -ret) if ret < 0
|
|
self
|
|
end
|
|
|
|
# Apply the filter to a block of code in a forked subprocess.
|
|
def apply_to!(&block)
|
|
raise ArgumentError, "Seccomp::Filter#apply_block!: block required" unless block_given?
|
|
|
|
pid = Process.fork do
|
|
apply!
|
|
yield self
|
|
end
|
|
|
|
pid, status = Process.wait2(pid)
|
|
if status.signaled? && Signal.signame(status.termsig) == "SYS"
|
|
raise Error, "Subprocess called unauthorized syscall (see dmesg for details)"
|
|
end
|
|
|
|
self
|
|
end
|
|
|
|
# Return a string representing the filter in BPF (Berkeley Packet Filter) format.
|
|
#
|
|
# @return [String]
|
|
def to_bpf
|
|
IO.pipe do |reader, writer|
|
|
ret = LibSeccomp.seccomp_export_bpf(context, writer.fileno)
|
|
raise SystemCallError.new("seccomp_export_bpf() failed", -ret) if ret < 0
|
|
writer.close
|
|
reader.read
|
|
end
|
|
end
|
|
|
|
# Return a string representing the filter in PFC (Pseudo Filter Code) format.
|
|
#
|
|
# @return [String]
|
|
def to_pfc
|
|
IO.pipe do |reader, writer|
|
|
ret = LibSeccomp.seccomp_export_pfc(context, writer.fileno)
|
|
raise SystemCallError.new("seccomp_export_pfc() failed", -ret) if ret < 0
|
|
writer.close
|
|
reader.read
|
|
end
|
|
end
|
|
|
|
protected
|
|
|
|
# Create a new libseccomp context.
|
|
#
|
|
# @param default_action [Symbol] The default_action for the context
|
|
# @return [FFI::AutoPointer] The libseccomp context
|
|
def init!(default_action)
|
|
context = LibSeccomp.seccomp_init(default_action)
|
|
raise Errno::ENOMEM, "seccomp_init(#{default_action}) failed" if context == nil
|
|
|
|
FFI::AutoPointer.new(context, LibSeccomp.method(:seccomp_release))
|
|
end
|
|
|
|
# Set an attribute on the filter.
|
|
#
|
|
# @see https://man7.org/linux/man-pages/man3/seccomp_attr_set.3.html
|
|
# @param [Symbol] the attribute name
|
|
# @param [Integer] the attribute value
|
|
def set_attr(attr, value)
|
|
ret = LibSeccomp.seccomp_attr_set(context, attr, value)
|
|
raise SystemCallError.new("seccomp_attr_set(context, #{attr.inspect}, #{value}) failed", -ret) if ret < 0
|
|
end
|
|
|
|
# If true, apply the filter to all threads in the process. If false,
|
|
# apply it only to the current thread.
|
|
#
|
|
# @param value [Boolean]
|
|
# @return [void]
|
|
def tsync=(value)
|
|
set_attr(:tsync, value ? 1 : 0)
|
|
@tsync = value
|
|
end
|
|
|
|
# If true, generate the BPF code as a binary tree of if-else statements.
|
|
# May be faster for large rule sets. If false, generate the BPF code as a
|
|
# sequential list of if-else statements.
|
|
#
|
|
# @param value [Boolean]
|
|
# @return [void]
|
|
def optimize=(value)
|
|
set_attr(:optimize, value ? 2 : 1)
|
|
@optimize = value
|
|
end
|
|
end
|
|
|
|
# Create a filter allowing only the given set of syscalls. Deny all other
|
|
# syscalls by default. Calling a denied syscall will kill the process by default.
|
|
def self.allow(syscalls, default_action: :kill)
|
|
filter(syscalls, :allow, default_action: default_action)
|
|
end
|
|
|
|
# Create a filter denying the given set of syscalls. Allow all other
|
|
# syscalls by default. Calling a denied syscall will kill the process by default.
|
|
def self.deny(syscalls, default_action: :allow)
|
|
filter(syscalls, :kill, default_action: default_action)
|
|
end
|
|
|
|
# Create and immediately apply a filter allowing only the given set of syscalls.
|
|
def self.allow!(syscalls, default_action: :kill, &block)
|
|
allow(syscalls, default_action: default_action).apply!(&block)
|
|
end
|
|
|
|
# Create and immediately apply a filter denying the given set of syscalls.
|
|
def self.deny!(syscalls, default_action: :allow, &block)
|
|
deny(syscalls, default_action: default_action).apply!(&block)
|
|
end
|
|
|
|
# Create a syscall filter for the current process that performs `action` when
|
|
# any of the given syscalls are called, or `default_action` when any other
|
|
# syscall is called.
|
|
#
|
|
# Call `apply!` on the result to activate the filter.
|
|
#
|
|
# @param syscalls [Array<String>] The set of syscalls
|
|
# @param action [Symbol] The action to take when any of the given syscalls are called (:allow, :log, :kill)
|
|
# @param default_action [Symbol] The action to take when any other syscall is called (:allow, :log, :kill)
|
|
# @param options [Hash] Options to pass to Seccomp::Filter#initialize
|
|
# @return [Seccomp::Filter] the seccomp filter
|
|
def self.filter(syscalls, action, default_action: :kill, **options)
|
|
Filter.new(default_action, **options) do |filter|
|
|
expand_syscall_names(syscalls).each do |syscall|
|
|
filter.add_rule(syscall, action)
|
|
end
|
|
end
|
|
end
|
|
|
|
# Return the list of syscalls available on the current system. This list may
|
|
# vary depending on the CPU architecture and kernel version.
|
|
#
|
|
# @return [Hash<Integer, String>] a hash of syscall numbers to syscall names
|
|
def self.syscalls
|
|
@syscalls ||= 0.upto(8192).map do |n|
|
|
[n, resolve_syscall_number(n) ]
|
|
end.to_h.compact
|
|
end
|
|
|
|
# Recursively expand a list of syscall names, that may contain a mixture of regular
|
|
# names and syscall group names (e.g. `@stdio`), to a flat list of syscall names.
|
|
#
|
|
# @param syscall_names [Array<String>] A list of syscall names. May include syscall
|
|
# groups (e.g `@stdio`). May be a space-separated string, or a list of strings.
|
|
# @return [Array<String>] A list of syscall names
|
|
def self.expand_syscall_names(*syscall_names)
|
|
syscall_names.flatten.flat_map(&:split).flat_map do |syscall|
|
|
if syscall.start_with?("@")
|
|
group = SYSCALL_GROUPS.fetch(syscall)
|
|
expand_syscall_names(group)
|
|
else
|
|
syscall
|
|
end
|
|
end.sort.uniq
|
|
end
|
|
|
|
# Resolve a syscall name to a syscall number.
|
|
#
|
|
# May return a negative number if the syscall exists on another architecture,
|
|
# but not on this architecture. For example, `arch_prctl` exists on x86 but
|
|
# not on ARM or other architectures.
|
|
#
|
|
# Raises an error if the syscall doesn't exist on any architecture.
|
|
#
|
|
# @param [String] the syscall name
|
|
# @return [Integer] the syscall number
|
|
# @raise [Errno::EINVAL] if the syscall doesn't exist
|
|
def self.resolve_syscall_name(syscall_name)
|
|
syscall_number = LibSeccomp.seccomp_syscall_resolve_name(syscall_name.to_s)
|
|
raise Errno::EINVAL, "Syscall '#{syscall_name}' doesn't exist" if syscall_number == -1
|
|
syscall_number
|
|
end
|
|
|
|
# Resolve a syscall number to a syscall name.
|
|
#
|
|
# @param syscall_number [Integer] The syscall number
|
|
# @param arch [Symbol] The CPU architecture (x86_64, aarch64, etc)
|
|
# @return [String, nil] The syscall name, or nil if a syscall by that number doesn't exist
|
|
def self.resolve_syscall_number(syscall_number, arch = :native)
|
|
name, ptr = LibSeccomp.seccomp_syscall_resolve_num_arch(arch, syscall_number)
|
|
name
|
|
ensure
|
|
LibC.free(ptr)
|
|
end
|
|
end
|