From 0c1e9a16181ce496db2c2acb9bcfbd314ab80a70 Mon Sep 17 00:00:00 2001 From: evazion Date: Mon, 14 Nov 2022 13:36:09 -0600 Subject: [PATCH] Add Danbooru::Archive library for handling .zip and .rar files. Introduce a new Danbooru::Archive library. This is a wrapper around libarchive that lets us extract .zip, .rar, .7z, and other archive formats. Replace the rubyzip library in MediaFile::Ugoira with the new Danbooru::Archive library. This is a step towards fixing #5340: Add support for extracting archive attachments from certain sources. This adds a new dependency on libarchive. Downstream users should `apt-get install libarchive13` if they're not using Docker. https://github.com/chef/ffi-libarchive https://github.com/libarchive/libarchive https://www.rubydoc.info/gems/ffi-libarchive/0.4.2 https://github.com/libarchive/libarchive/wiki/Examples#a-complete-extractor --- Gemfile | 2 +- Gemfile.lock | 4 +- app/logical/danbooru/archive.rb | 236 +++++++++++++++++++++++++++++ app/logical/media_file/ugoira.rb | 33 ++-- config/docker/build-base-image.sh | 2 +- test/unit/danbooru_archive_test.rb | 111 ++++++++++++++ 6 files changed, 361 insertions(+), 27 deletions(-) create mode 100644 app/logical/danbooru/archive.rb create mode 100644 test/unit/danbooru_archive_test.rb diff --git a/Gemfile b/Gemfile index d97d929a0..0f29f34ae 100644 --- a/Gemfile +++ b/Gemfile @@ -9,7 +9,6 @@ gem "sanitize" gem 'ruby-vips' gem 'diff-lcs', :require => "diff/lcs/array" gem 'bcrypt', :require => "bcrypt" -gem 'rubyzip', :require => "zip" gem 'stripe' gem 'aws-sdk-sqs', '~> 1' gem 'responders' @@ -59,6 +58,7 @@ gem "ffaker" gem "composite_primary_keys" gem "resolv" gem "rover-df" +gem "ffi-libarchive" group :development do gem 'rubocop', require: false diff --git a/Gemfile.lock b/Gemfile.lock index ac45fa5a9..8d7c933ba 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -187,6 +187,8 @@ GEM ffi-compiler (1.0.1) ffi (>= 1.0.0) rake + ffi-libarchive (1.1.3) + ffi (~> 1.0) flamegraph (0.9.5) fugit 
(1.7.1) et-orbi (~> 1, >= 1.2.7) @@ -550,6 +552,7 @@ DEPENDENCIES factory_bot ffaker ffi + ffi-libarchive flamegraph good_job google-cloud-bigquery @@ -589,7 +592,6 @@ DEPENDENCIES rubocop rubocop-rails ruby-vips - rubyzip sanitize scenic selenium-webdriver diff --git a/app/logical/danbooru/archive.rb b/app/logical/danbooru/archive.rb new file mode 100644 index 000000000..8c67bbaa2 --- /dev/null +++ b/app/logical/danbooru/archive.rb @@ -0,0 +1,236 @@ +# frozen_string_literal: true + +# Danbooru::Archive is a utility class representing a .zip, .rar, or .7z archive file. This is a wrapper around +# libarchive that adds some utility methods for extracting an archive safely. +# +# @example +# Danbooru::Archive.extract!("foo.zip") do |dir, filenames| +# puts dir, filenames +# end +# +# @see https://github.com/chef/ffi-libarchive +# @see https://www.rubydoc.info/gems/ffi-libarchive/0.4.2 +# @see https://github.com/libarchive/libarchive/wiki/ManualPages + +module Archive + module C + # XXX Monkey patch ffi-libarchive to add some functions we need. + # https://www.freebsd.org/cgi/man.cgi?query=archive_util&sektion=3&format=html + attach_function_maybe :archive_format_name, [:pointer], :string + attach_function_maybe :archive_filter_name, [:pointer, :int], :string + attach_function_maybe :archive_filter_count, [:pointer], :int + end +end + +module Danbooru + class Archive + class Error < StandardError; end + + # Default flags when extracting files. + # @see https://www.freebsd.org/cgi/man.cgi?query=archive_write_disk&sektion=3&format=html + DEFAULT_FLAGS = + ::Archive::EXTRACT_NO_OVERWRITE | + #::Archive::EXTRACT_SECURE_NOABSOLUTEPATHS | + ::Archive::EXTRACT_SECURE_SYMLINKS | + ::Archive::EXTRACT_SECURE_NODOTDOT + + attr_reader :file + + # Open an archive, or raise an error if the archive can't be opened. If given a block, pass the archive to the block + # and close the archive after the block finishes. 
+ # + # @param filelike [String, File] The filename of the archive, or an open archive file. + # @yieldparam [Danbooru::Archive] The archive. + # @return [Danbooru::Archive] The archive. + def self.open!(filelike, &block) + file = filelike.is_a?(File) ? filelike : Kernel.open(filelike, binmode: true) + archive = new(file) + + if block_given? + begin + yield archive + ensure + archive.close + end + else + archive + end + rescue => error + archive&.close + raise Error, error + end + + # Open an archive, or return nil if the archive can't be opened. See `.open!` for details. + def self.open(filelike, &block) + open!(filelike, &block) + rescue Error + nil + end + + # Extract the archive to the given directory. If a block is given, extract the archive to a temp directory and + # delete the directory afterwards. The block is given the name of the directory and the list of files. + # + # @param filelike [String, File] The filename of the archive, or an open archive file. + # @param directory [String] The directory to extract the files to. By default, this is a temp directory the caller must clean up. + # @yieldparam [String, Array] The path to the temp directory, and the list of extracted files in the directory. + # @return [(String, Array)] The path to the directory, and the list of extracted files in the directory. + def self.extract!(filelike, directory = nil, flags: DEFAULT_FLAGS, &block) + open!(filelike) do |archive| + archive.extract!(directory, flags: flags, &block) + end + end + + # @param file [File] The archive file. + def initialize(file) + @file = file + end + + def close + # no-op + end + + # Iterate across each entry (file) in the archive. + # + # @return [Enumerator, Danbooru::Archive] If given a block, call the block on each entry and return the archive + # itself. If not given a block, return an Enumerator. + def each_entry(&block) + return enum_for(:each_entry) unless block_given? 
+ + # XXX We have to re-open the archive on every call because libarchive is designed for streaming and doesn't + # support iterating across the archive multiple times. + archive = ::Archive::Reader.open_filename(file.path) + while (entry = archive.next_header(clone_entry: true)) + yield Entry.new(archive, entry) + end + + self + ensure + archive&.close + end + alias_method :entries, :each_entry + + # Extract the files in the archive to a directory. Subdirectories inside the archive are ignored; all files are + # extracted to a single top-level directory. + # + # If a block is given, extract the archive to a temp directory and delete the directory after the block finishes. + # Otherwise, extract to a temp directory and return the directory. The caller should delete the directory afterwards. + # + # @param directory [String] The directory to extract the files to. By default, this is a temp directory the caller must clean up. + # @yieldparam [String, Array] The name of the temp directory, and the list of files in the directory. + # @return [(String, Array)] The path to the directory, and the list of extracted files. + def extract!(directory = nil, flags: DEFAULT_FLAGS, &block) + raise ArgumentError, "can't pass directory and block at the same time" if block_given? && directory.present? + + if block_given? + Dir.mktmpdir(["danbooru-archive-", "-" + File.basename(file.path)]) do |dir| + filenames = extract_to!(dir, flags: flags) + yield dir, filenames + end + else + dir = directory.presence || Dir.mktmpdir(["danbooru-archive-", "-" + File.basename(file.path)]) + filenames = extract_to!(dir, flags: flags) + [dir, filenames] + end + end + + # Extract the archive to a directory. See `extract!` for details. 
+ def extract_to!(directory, flags: DEFAULT_FLAGS) + entries.map do |entry| + raise Danbooru::Archive::Error, "Can't extract archive containing absolute path (path: '#{entry.pathname_utf8}')" if entry.pathname_utf8.starts_with?("/") + raise Danbooru::Archive::Error, "'#{entry.pathname_utf8}' is not a regular file" if !entry.file? + + path = "#{directory}/#{entry.pathname_utf8.tr("/", "_")}" + entry.extract!(path, flags: flags) + end + end + + # @return [Integer] The total decompressed size of all files in the archive. + def uncompressed_size + @uncompressed_size ||= entries.sum(&:size) + end + + # @return [Boolean] True if any entry in the archive satisfies the condition; otherwise false. + def exists?(&block) + entries.with_index { |entry, index| return true if yield entry, index + 1 } + false + end + + # @return [String] The archive format ("RAR", "ZIP", etc). + def format + @format ||= entries.lazy.map(&:format).first + end + + # Print the archive contents in `ls -l` format. + def ls(io = STDOUT) + io.puts(entries.map(&:ls).join("\n")) + end + end + + # An entry represents a single file in an archive. + class Entry + attr_reader :archive, :entry + delegate :directory?, :file?, :close, :pathname, :pathname=, :size, :strmode, :uid, :gid, :mtime, to: :entry + + # @param archive [::Archive::Reader] The archive the entry belongs to. + # @param entry [::Archive::Entry] The archive entry. + def initialize(archive, entry) + @archive = archive + @entry = entry + end + + # Copy the entry. Called by `dup`. + def initialize_copy(entry) + @archive = entry.archive + @entry = ::Archive::Entry.new(entry.ffi_ptr, clone: true) + end + + # Extract the file to the given destination. By default, don't overwrite files, don't allow symlinks or paths + # containing '..', and don't extract file ownership, permission, or timestamp information. + # + # @param destination [String] The path to extract the file to. + # @param flags [Integer] The extraction flags. 
+ # @return [String] The path to the extracted file. + def extract!(destination, flags: Danbooru::Archive::DEFAULT_FLAGS) + entry = dup + entry.pathname = destination + + result = ::Archive::C.archive_read_extract(entry.archive_ffi_ptr, entry.ffi_ptr, flags) + raise Danbooru::Archive::Error, "Error extracting '#{entry.pathname_utf8}': #{archive.error_string}" if result != ::Archive::C::OK + + entry.pathname_utf8 + end + + # @return [String] The pathname encoded as UTF-8 instead of ASCII-8BIT. May be wrong if the original pathname wasn't UTF-8. + def pathname_utf8 + pathname.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?") + end + + # @return [String] The archive entry format ("RAR", "ZIP", etc). + def format + ::Archive::C::archive_format_name(archive_ffi_ptr) + end + + # @return [Array] The list of filters for the entry. + def filters + count = ::Archive::C::archive_filter_count(archive_ffi_ptr) + + count.times.map do |n| + ::Archive::C::archive_filter_name(archive_ffi_ptr, n) + end + end + + # @return [String] The entry in `ls -l` format. + def ls + "#{strmode} #{uid} #{gid} #{"%9d" % size} #{mtime.to_fs(:db)} #{pathname_utf8}" + end + + def archive_ffi_ptr + archive.send(:archive) + end + + # @return [FFI::Pointer] The pointer to the libarchive entry object. + def ffi_ptr + entry.entry + end + end +end diff --git a/app/logical/media_file/ugoira.rb b/app/logical/media_file/ugoira.rb index 0ef45fa93..bf7f10c05 100644 --- a/app/logical/media_file/ugoira.rb +++ b/app/logical/media_file/ugoira.rb @@ -18,7 +18,6 @@ class MediaFile::Ugoira < MediaFile def close file.close - zipfile.close preview_frame.close end @@ -52,24 +51,15 @@ class MediaFile::Ugoira < MediaFile raise NotImplementedError, "can't convert ugoira to webm: ffmpeg or mkvmerge not installed" unless self.class.videos_enabled? raise RuntimeError, "can't convert ugoira to webm: no ugoira frame data was provided" unless frame_delays.present? 
- Dir.mktmpdir("ugoira-#{md5}") do |tmpdir| + Danbooru::Archive.extract!(file) do |tmpdir, filenames| output_file = Tempfile.new(["ugoira-conversion", ".webm"], binmode: true) - FileUtils.mkdir_p("#{tmpdir}/images") - - zipfile.each do |entry| - path = File.join(tmpdir, "images", entry.name) - entry.extract(path) - end - # Duplicate last frame to avoid it being displayed only for a very short amount of time. - last_file_name = zipfile.entries.last.name - last_file_name =~ /\A(\d{6})(\.\w{,4})\Z/ - new_last_index = $1.to_i + 1 - file_ext = $2 - new_last_filename = ("%06d" % new_last_index) + file_ext - path_from = File.join(tmpdir, "images", last_file_name) - path_to = File.join(tmpdir, "images", new_last_filename) + last_file_name = File.basename(filenames.last) + last_index, file_ext = last_file_name.split(".") + new_last_filename = "#{"%06d" % (last_index.to_i + 1)}.#{file_ext}" + path_from = File.join(tmpdir, last_file_name) + path_to = File.join(tmpdir, new_last_filename) FileUtils.cp(path_from, path_to) delay_sum = 0 @@ -84,11 +74,10 @@ class MediaFile::Ugoira < MediaFile f.write("#{delay_sum}\n") end - ext = zipfile.first.name.match(/\.(\w{,4})$/)[1] - ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/images/%06d.#{ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 1 -passlogfile #{tmpdir}/ffmpeg2pass -f null /dev/null") + ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/%06d.#{file_ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 1 -passlogfile #{tmpdir}/ffmpeg2pass -f null /dev/null") raise Error, "ffmpeg failed: #{ffmpeg_out}" unless status.success? 
- ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/images/%06d.#{ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 2 -passlogfile #{tmpdir}/ffmpeg2pass #{tmpdir}/tmp.webm") + ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/%06d.#{file_ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 2 -passlogfile #{tmpdir}/ffmpeg2pass #{tmpdir}/tmp.webm") raise Error, "ffmpeg failed: #{ffmpeg_out}" unless status.success? mkvmerge_out, status = Open3.capture2e("mkvmerge -o #{output_file.path} --webm --timecodes 0:#{tmpdir}/timecodes.tc #{tmpdir}/tmp.webm") @@ -100,13 +89,9 @@ class MediaFile::Ugoira < MediaFile private - def zipfile - Zip::File.new(file.path) - end - def preview_frame FFmpeg.new(convert).smart_video_preview end - memoize :zipfile, :preview_frame, :dimensions, :convert, :metadata + memoize :preview_frame, :dimensions, :convert, :metadata end diff --git a/config/docker/build-base-image.sh b/config/docker/build-base-image.sh index 20ef18684..83b8f75ff 100755 --- a/config/docker/build-base-image.sh +++ b/config/docker/build-base-image.sh @@ -26,7 +26,7 @@ DANBOORU_RUNTIME_DEPS=" ca-certificates mkvtoolnix rclone libpq5 openssl libgmpxx4ldbl zlib1g libfftw3-3 libwebp7 libwebpmux3 libwebpdemux2 liborc-0.4.0 liblcms2-2 libpng16-16 libexpat1 libglib2.0 libgif7 libexif12 libheif1 libvpx7 libdav1d6 - libseccomp2 libseccomp-dev libjemalloc2 + libseccomp2 libseccomp-dev libjemalloc2 libarchive13 " COMMON_RUNTIME_DEPS=" $DANBOORU_RUNTIME_DEPS $EXIFTOOL_RUNTIME_DEPS tini busybox less ncdu diff --git a/test/unit/danbooru_archive_test.rb b/test/unit/danbooru_archive_test.rb new file mode 100644 index 000000000..b07b38e68 --- /dev/null +++ b/test/unit/danbooru_archive_test.rb @@ -0,0 +1,111 @@ +require 'test_helper' + +class DanbooruArchiveTest < ActiveSupport::TestCase + context "Danbooru::Archive" do + context ".open! 
method" do + should "work without a block" do + archive = Danbooru::Archive.open!("test/files/ugoira.zip") + assert_equal(5, archive.entries.count) + end + + should "work with a block" do + Danbooru::Archive.open!("test/files/ugoira.zip") do |archive| + assert_equal(5, archive.entries.count) + end + end + + should "raise an error if the block raises an error" do + assert_raises(Danbooru::Archive::Error) { Danbooru::Archive.open!("test/files/ugoira.zip") { raise "failed" } } + end + + should "raise an error if the file doesn't exist" do + assert_raises(Danbooru::Archive::Error) { Danbooru::Archive.open!("test/files/does_not_exist.zip") } + end + end + + context ".open method" do + should "work without a block" do + archive = Danbooru::Archive.open("test/files/ugoira.zip") + assert_equal(5, archive.entries.count) + end + + should "work with a block" do + Danbooru::Archive.open("test/files/ugoira.zip") do |archive| + assert_equal(5, archive.entries.count) + end + end + + should "return nil if the block raises an error" do + assert_nil(Danbooru::Archive.open("test/files/ugoira.zip") { raise "failed" }) + end + + should "return nil if the file doesn't exist" do + assert_nil(Danbooru::Archive.open("test/files/does_not_exist.zip")) + end + end + + context ".extract! 
method" do + should "extract to temp directory if not given a block or directory" do + dir, filenames = Danbooru::Archive.extract!("test/files/ugoira.zip") + + assert_equal(true, File.directory?(dir)) + assert_equal(5, filenames.size) + filenames.each { |filename| assert_equal(true, File.exist?(filename)) } + ensure + FileUtils.rm_rf(dir) + end + + should "extract to a temp directory and delete it afterwards if given a block" do + Danbooru::Archive.extract!("test/files/ugoira.zip") do |dir, filenames| + @tmpdir = dir + assert_equal(true, File.directory?(dir)) + assert_equal(5, filenames.size) + filenames.each { |filename| assert_equal(true, File.exist?(filename)) } + end + + assert_equal(true, @tmpdir.present?) + assert_equal(false, File.exist?(@tmpdir)) + end + + should "extract to given directory if given a directory" do + Dir.mktmpdir do |tmpdir| + dir, filenames = Danbooru::Archive.extract!("test/files/ugoira.zip", tmpdir) + assert_equal(dir, tmpdir) + assert_equal(5, filenames.size) + filenames.each { |filename| assert_equal(true, File.exist?(filename)) } + end + end + end + + context "#uncompressed_size method" do + should "work" do + archive = Danbooru::Archive.open!("test/files/ugoira.zip") + assert_equal(6161, archive.uncompressed_size) + end + end + + context "#exists? method" do + should "work" do + archive = Danbooru::Archive.open!("test/files/ugoira.zip") + assert_equal(true, archive.exists? { |entry, count| count > 4 }) + end + end + + context "#format method" do + should "work" do + archive = Danbooru::Archive.open!("test/files/ugoira.zip") + assert_equal("ZIP 2.0 (uncompressed)", archive.format) + end + end + + context "#ls method" do + should "work" do + archive = Danbooru::Archive.open!("test/files/ugoira.zip") + output = StringIO.new + + archive.ls(output) + assert_match(/^-rw-rw-r-- *0 0 *1639 2014-10-05 23:31:06 000000\.jpg$/, output.tap(&:rewind).read) + end + end + end +end