Add Danbooru::Archive library for handling .zip and .rar files.

Introduce a new Danbooru::Archive library. This is a wrapper around libarchive that lets us extract
.zip, .rar, .7z, and other archive formats. Replace the rubyzip library in MediaFile::Ugoira with
the new Danbooru::Archive library.

This is a step towards fixing #5340: Add support for extracting archive attachments from certain sources.

This adds a new dependency on libarchive. Downstream users should `apt-get install libarchive13` if
they're not using Docker.

https://github.com/chef/ffi-libarchive
https://github.com/libarchive/libarchive
https://www.rubydoc.info/gems/ffi-libarchive/0.4.2
https://github.com/libarchive/libarchive/wiki/Examples#a-complete-extractor
This commit is contained in:
evazion
2022-11-14 13:36:09 -06:00
parent f942768ce8
commit 0c1e9a1618
6 changed files with 361 additions and 27 deletions

View File

@@ -9,7 +9,6 @@ gem "sanitize"
gem 'ruby-vips'
gem 'diff-lcs', :require => "diff/lcs/array"
gem 'bcrypt', :require => "bcrypt"
gem 'rubyzip', :require => "zip"
gem 'stripe'
gem 'aws-sdk-sqs', '~> 1'
gem 'responders'
@@ -59,6 +58,7 @@ gem "ffaker"
gem "composite_primary_keys"
gem "resolv"
gem "rover-df"
gem "ffi-libarchive"
group :development do
gem 'rubocop', require: false

View File

@@ -187,6 +187,8 @@ GEM
ffi-compiler (1.0.1)
ffi (>= 1.0.0)
rake
ffi-libarchive (1.1.3)
ffi (~> 1.0)
flamegraph (0.9.5)
fugit (1.7.1)
et-orbi (~> 1, >= 1.2.7)
@@ -550,6 +552,7 @@ DEPENDENCIES
factory_bot
ffaker
ffi
ffi-libarchive
flamegraph
good_job
google-cloud-bigquery
@@ -589,7 +592,6 @@ DEPENDENCIES
rubocop
rubocop-rails
ruby-vips
rubyzip
sanitize
scenic
selenium-webdriver

View File

@@ -0,0 +1,236 @@
# frozen_string_literal: true
# Danbooru::Archive is a utility class representing a .zip, .rar, or .7z archive file. This is a wrapper around
# libarchive that adds some utility methods for extracting an archive safely.
#
# @example
# Danbooru::Archive.extract!("foo.zip") do |dir, filenames|
# puts dir, filenames
# end
#
# @see https://github.com/chef/ffi-libarchive
# @see https://www.rubydoc.info/gems/ffi-libarchive/0.4.2
# @see https://github.com/libarchive/libarchive/wiki/ManualPages
module Archive
  module C
    # XXX Monkey patch ffi-libarchive to add some functions we need.
    # https://www.freebsd.org/cgi/man.cgi?query=archive_util&sektion=3&format=html
    #
    # Each row is: function name => [argument types, return type].
    {
      archive_format_name: [[:pointer], :string],
      archive_filter_name: [[:pointer, :int], :string],
      archive_filter_count: [[:pointer], :int],
    }.each do |function, (arg_types, return_type)|
      attach_function_maybe function, arg_types, return_type
    end
  end
end
module Danbooru
class Archive
class Error < StandardError; end
# Default flags when extracting files: don't overwrite existing files, and don't allow symlinks or paths
# containing '..'.
# @see https://www.freebsd.org/cgi/man.cgi?query=archive_write_disk&sektion=3&format=html
#
# NOTE(review): EXTRACT_SECURE_NOABSOLUTEPATHS is left disabled below, presumably because `extract_to!` rewrites
# every entry's pathname to a path under the target directory (which may itself be absolute) and rejects
# absolute entry paths on its own -- confirm before re-enabling.
DEFAULT_FLAGS =
::Archive::EXTRACT_NO_OVERWRITE |
#::Archive::EXTRACT_SECURE_NOABSOLUTEPATHS |
::Archive::EXTRACT_SECURE_SYMLINKS |
::Archive::EXTRACT_SECURE_NODOTDOT
attr_reader :file
# Open an archive, or raise an error if the archive can't be opened. If given a block, pass the archive to the block
# and close the archive after the block finishes.
#
# @param filelike [String, File] The filename of the archive, or an open archive file.
# @yieldparam [Danbooru::Archive] The archive.
# @return [Danbooru::Archive, Object] The archive, or the block's result if a block was given.
# @raise [Danbooru::Archive::Error] If the file can't be opened, or if the block raises a StandardError.
def self.open!(filelike, &block)
  # Use File.open rather than Kernel#open: Kernel#open treats a string starting with "|" as a command to run,
  # which must never happen for a filename that may come from outside.
  file = filelike.is_a?(File) ? filelike : File.open(filelike, binmode: true)
  archive = new(file)
  return archive unless block_given?

  begin
    yield archive
  ensure
    archive.close
  end
rescue => error
  # `archive` is nil if opening the file failed before the archive was built.
  archive&.close
  raise Error, error
end
# Non-raising variant of `open!`: open an archive, but return nil instead of raising `Danbooru::Archive::Error`
# when the archive can't be opened (or when the block fails). See `open!` for the parameters.
#
# @return [Danbooru::Archive, Object, nil] Whatever `open!` returns, or nil on error.
def self.open(filelike, &block)
  open!(filelike, &block)
rescue Error => _error
  nil
end
# Open the archive and extract it in one step. If a block is given, the archive is extracted to a temp directory
# that is deleted afterwards; the block receives the directory name and the list of extracted files. Otherwise the
# archive is extracted to the given directory.
#
# @param filelike [String, File] The filename of the archive, or an open archive file.
# @param directory [String] The directory to extract the files to. By default, this is a temp directory the caller must clean up.
# @param flags [Integer] The libarchive extraction flags.
# @yieldparam [String, Array<String>] The path to the temp directory, and the list of extracted files in the directory.
# @return [(String, Array<String>)] The path to the directory, and the list of extracted files in the directory
#   (or the block's result, if a block was given).
def self.extract!(filelike, directory = nil, flags: DEFAULT_FLAGS, &block)
  open!(filelike) { |archive| archive.extract!(directory, flags: flags, &block) }
end
# Wrap an archive file. Only the file is stored here; nothing is read from it until the archive is iterated.
#
# @param file [File] The archive file.
def initialize(file)
@file = file
end
# Close the archive. Currently a no-op: `each_entry` opens and closes its own libarchive reader on every call, so
# there is no long-lived reader to release here.
#
# NOTE(review): the underlying `file` is not closed here either, including when `open!` opened it from a
# filename -- confirm whether callers are expected to close it.
def close
# no-op
end
# Iterate across each entry (file) in the archive.
#
# @yieldparam [Entry] Each entry, in archive order.
# @return [Enumerator, Danbooru::Archive] If given a block, call the block on each entry and return the archive
#   itself. If not given a block, return an Enumerator.
def each_entry(&block)
  return to_enum(:each_entry) unless block_given?

  # XXX libarchive is a streaming API and can't rewind, so a fresh reader is opened for every pass over the
  # archive and closed again when the pass ends (even if the block breaks out or raises).
  reader = ::Archive::Reader.open_filename(file.path)
  while (header = reader.next_header(clone_entry: true))
    yield Entry.new(reader, header)
  end

  self
ensure
  reader&.close
end
alias_method :entries, :each_entry
# Extract the files in the archive to a directory. Subdirectories inside the archive are ignored; all files are
# extracted to a single top-level directory.
#
# If a block is given, extract the archive to a temp directory and delete the directory after the block finishes.
# Otherwise, extract to the given directory, or to a new temp directory if none was given, and return it. The
# caller should delete the temp directory afterwards. If extraction into a new temp directory fails, the
# directory is removed before the error is re-raised, because the caller never learns its path.
#
# @param directory [String] The directory to extract the files to. By default, this is a temp directory the caller must clean up.
# @param flags [Integer] The libarchive extraction flags.
# @yieldparam [String, Array<String>] The name of the temp directory, and the list of files in the directory.
# @return [(String, Array<String>)] The path to the directory, and the list of extracted files (or the block's
#   result, if a block was given).
# @raise [ArgumentError] If both a directory and a block are given.
def extract!(directory = nil, flags: DEFAULT_FLAGS, &block)
  raise ArgumentError, "can't pass directory and block at the same time" if block_given? && directory.present?

  tmpdir_affixes = ["danbooru-archive-", "-#{File.basename(file.path)}"]

  if block_given?
    Dir.mktmpdir(tmpdir_affixes) do |dir|
      yield dir, extract_to!(dir, flags: flags)
    end
  elsif directory.present?
    [directory, extract_to!(directory, flags: flags)]
  else
    dir = Dir.mktmpdir(tmpdir_affixes)
    begin
      [dir, extract_to!(dir, flags: flags)]
    rescue StandardError
      # Don't leak the temp directory we just created. rm_rf ignores its own errors, so the original
      # exception is the one that propagates.
      FileUtils.rm_rf(dir)
      raise
    end
  end
end
# Extract every entry of the archive directly into `directory`. Entry paths are flattened: each "/" in the
# entry's pathname becomes "_", so no subdirectories are created. See `extract!` for details.
#
# @param directory [String] The directory to extract the files to.
# @param flags [Integer] The libarchive extraction flags.
# @return [Array<String>] The paths of the extracted files.
# @raise [Danbooru::Archive::Error] If an entry has an absolute path or is not a regular file.
def extract_to!(directory, flags: DEFAULT_FLAGS)
  entries.map do |entry|
    name = entry.pathname_utf8
    raise Danbooru::Archive::Error, "Can't extract archive containing absolute path (path: '#{name}')" if name.starts_with?("/")
    raise Danbooru::Archive::Error, "'#{name}' is not a regular file" unless entry.file?

    entry.extract!("#{directory}/#{name.tr("/", "_")}", flags: flags)
  end
end
# Sum the sizes of all entries. The result is memoized, so the archive is only scanned on the first call.
#
# @return [Integer] The total decompressed size of all files in the archive.
def uncompressed_size
  @uncompressed_size ||= entries.sum { |entry| entry.size }
end
# Check whether any entry matches the block. Iteration stops at the first match.
#
# @yieldparam entry [Entry] The entry.
# @yieldparam position [Integer] The 1-based position of the entry in the archive.
# @return [Boolean] True if any entry in the archive satisfies the condition; otherwise false.
def exists?(&block)
  entries.with_index(1).any? { |entry, position| yield entry, position }
end
# The format reported for the first entry of the archive, memoized after the first successful lookup.
#
# @return [String, nil] The archive format ("RAR", "ZIP", etc), or nil if the archive has no entries.
def format
  # Stay lazy: the format is read while iterating, before `each_entry` closes its reader, and iteration
  # stops after the first entry.
  @format ||= entries.lazy.map { |entry| entry.format }.first
end
# Print the archive contents in `ls -l` format, one entry per line.
#
# @param io [IO] The stream to print to. Defaults to standard output.
def ls(io = STDOUT)
  listing = entries.map { |entry| entry.ls }
  io.puts(listing.join("\n"))
end
end
# An entry represents a single file in an archive.
class Entry
attr_reader :archive, :entry
delegate :directory?, :file?, :close, :pathname, :pathname=, :size, :strmode, :uid, :gid, :mtime, to: :entry
# @param archive [::Archive::Reader] The libarchive reader the entry was read from (see `Archive#each_entry`).
# @param entry [::Archive::Entry] The archive entry.
def initialize(archive, entry)
@archive = archive
@entry = entry
end
# Copy the entry. Called by `dup`. The copy keeps the same archive but wraps a new `::Archive::Entry` built from
# the original's entry pointer with `clone: true` -- presumably so that changes to the copy (such as the
# pathname set by `extract!`) don't affect the original.
#
# @param entry [Entry] The entry being copied.
def initialize_copy(entry)
@archive = entry.archive
@entry = ::Archive::Entry.new(entry.ffi_ptr, clone: true)
end
# Extract the file to the given destination. By default, don't overwrite files, don't allow symlinks or paths
# containing '..', and don't extract file ownership, permission, or timestamp information.
#
# The extraction works on a copy of the entry whose pathname is set to the destination, so this entry's own
# pathname is left unchanged.
#
# @param destination [String] The path to extract the file to.
# @param flags [Integer] The extraction flags.
# @return [String] The path to the extracted file.
# @raise [Danbooru::Archive::Error] If libarchive reports anything other than OK.
def extract!(destination, flags: Danbooru::Archive::DEFAULT_FLAGS)
  copy = dup
  copy.pathname = destination

  status = ::Archive::C.archive_read_extract(copy.archive_ffi_ptr, copy.ffi_ptr, flags)
  unless status == ::Archive::C::OK
    raise Danbooru::Archive::Error, "Error extracting '#{copy.pathname_utf8}': #{archive.error_string}"
  end

  copy.pathname_utf8
end
# The entry's pathname as a valid UTF-8 string. The raw pathname bytes are reinterpreted as UTF-8 (not
# transcoded), and any byte sequence that isn't valid UTF-8 is replaced with "?". Transcoding from ASCII-8BIT
# would instead turn every non-ASCII byte into "?", destroying valid UTF-8 names.
#
# @return [String] The pathname encoded as UTF-8 instead of ASCII-8BIT. May be wrong if the original pathname wasn't UTF-8.
def pathname_utf8
  # dup first so the string returned by `pathname` is never re-tagged in place.
  pathname.dup.force_encoding(Encoding::UTF_8).scrub("?")
end
# Ask libarchive for the format name, using the pointer of the archive this entry belongs to.
#
# @return [String] The archive entry format ("RAR", "ZIP", etc).
def format
  ::Archive::C.archive_format_name(archive_ffi_ptr)
end
# Ask libarchive how many filters the archive has, then look up the name of each one by index.
#
# @return [Array<String>] The list of filters for the entry.
def filters
  total = ::Archive::C.archive_filter_count(archive_ffi_ptr)
  (0...total).map { |index| ::Archive::C.archive_filter_name(archive_ffi_ptr, index) }
end
# Format the entry as one `ls -l` style line: mode, uid, gid, size (right-aligned to 9 columns), modification
# time, and pathname.
#
# @return [String] The entry in `ls -l` format.
def ls
  padded_size = "%9d" % size
  timestamp = mtime.to_fs(:db)

  "#{strmode} #{uid} #{gid} #{padded_size} #{timestamp} #{pathname_utf8}"
end
# The value passed to the libarchive C functions as the archive argument (see `extract!`, `format`, `filters`).
#
# NOTE(review): this calls the reader's `archive` method via `send`, presumably because it isn't public in
# ffi-libarchive and returns the raw libarchive handle -- confirm when upgrading the gem.
def archive_ffi_ptr
archive.send(:archive)
end
# The raw entry handle of the wrapped `::Archive::Entry`, as passed to the libarchive C functions and used when
# cloning the entry in `initialize_copy`.
#
# @return [FFI::Pointer] The pointer to the libarchive entry object.
def ffi_ptr
entry.entry
end
end
end

View File

@@ -18,7 +18,6 @@ class MediaFile::Ugoira < MediaFile
def close
file.close
zipfile.close
preview_frame.close
end
@@ -52,24 +51,15 @@ class MediaFile::Ugoira < MediaFile
raise NotImplementedError, "can't convert ugoira to webm: ffmpeg or mkvmerge not installed" unless self.class.videos_enabled?
raise RuntimeError, "can't convert ugoira to webm: no ugoira frame data was provided" unless frame_delays.present?
Dir.mktmpdir("ugoira-#{md5}") do |tmpdir|
Danbooru::Archive.extract!(file) do |tmpdir, filenames|
output_file = Tempfile.new(["ugoira-conversion", ".webm"], binmode: true)
FileUtils.mkdir_p("#{tmpdir}/images")
zipfile.each do |entry|
path = File.join(tmpdir, "images", entry.name)
entry.extract(path)
end
# Duplicate last frame to avoid it being displayed only for a very short amount of time.
last_file_name = zipfile.entries.last.name
last_file_name =~ /\A(\d{6})(\.\w{,4})\Z/
new_last_index = $1.to_i + 1
file_ext = $2
new_last_filename = ("%06d" % new_last_index) + file_ext
path_from = File.join(tmpdir, "images", last_file_name)
path_to = File.join(tmpdir, "images", new_last_filename)
last_file_name = File.basename(filenames.last)
last_index, file_ext = last_file_name.split(".")
new_last_filename = "#{"%06d" % (last_index.to_i + 1)}.#{file_ext}"
path_from = File.join(tmpdir, last_file_name)
path_to = File.join(tmpdir, new_last_filename)
FileUtils.cp(path_from, path_to)
delay_sum = 0
@@ -84,11 +74,10 @@ class MediaFile::Ugoira < MediaFile
f.write("#{delay_sum}\n")
end
ext = zipfile.first.name.match(/\.(\w{,4})$/)[1]
ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/images/%06d.#{ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 1 -passlogfile #{tmpdir}/ffmpeg2pass -f null /dev/null")
ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/%06d.#{file_ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 1 -passlogfile #{tmpdir}/ffmpeg2pass -f null /dev/null")
raise Error, "ffmpeg failed: #{ffmpeg_out}" unless status.success?
ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/images/%06d.#{ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 2 -passlogfile #{tmpdir}/ffmpeg2pass #{tmpdir}/tmp.webm")
ffmpeg_out, status = Open3.capture2e("ffmpeg -i #{tmpdir}/%06d.#{file_ext} -codec:v libvpx-vp9 -crf 12 -b:v 0 -an -threads 8 -tile-columns 2 -tile-rows 1 -row-mt 1 -pass 2 -passlogfile #{tmpdir}/ffmpeg2pass #{tmpdir}/tmp.webm")
raise Error, "ffmpeg failed: #{ffmpeg_out}" unless status.success?
mkvmerge_out, status = Open3.capture2e("mkvmerge -o #{output_file.path} --webm --timecodes 0:#{tmpdir}/timecodes.tc #{tmpdir}/tmp.webm")
@@ -100,13 +89,9 @@ class MediaFile::Ugoira < MediaFile
private
def zipfile
Zip::File.new(file.path)
end
def preview_frame
FFmpeg.new(convert).smart_video_preview
end
memoize :zipfile, :preview_frame, :dimensions, :convert, :metadata
memoize :preview_frame, :dimensions, :convert, :metadata
end

View File

@@ -26,7 +26,7 @@ DANBOORU_RUNTIME_DEPS="
ca-certificates mkvtoolnix rclone libpq5 openssl libgmpxx4ldbl
zlib1g libfftw3-3 libwebp7 libwebpmux3 libwebpdemux2 liborc-0.4.0 liblcms2-2
libpng16-16 libexpat1 libglib2.0 libgif7 libexif12 libheif1 libvpx7 libdav1d6
libseccomp2 libseccomp-dev libjemalloc2
libseccomp2 libseccomp-dev libjemalloc2 libarchive13
"
COMMON_RUNTIME_DEPS="
$DANBOORU_RUNTIME_DEPS $EXIFTOOL_RUNTIME_DEPS tini busybox less ncdu

View File

@@ -0,0 +1,111 @@
require 'test_helper'
class DanbooruArchiveTest < ActiveSupport::TestCase
context "Danbooru::Archive" do
context ".open! method" do
  # The ugoira.zip fixture contains 5 frames.
  should "work without a block" do
    entry_count = Danbooru::Archive.open!("test/files/ugoira.zip").entries.count
    assert_equal(5, entry_count)
  end

  should "work with a block" do
    Danbooru::Archive.open!("test/files/ugoira.zip") { |archive| assert_equal(5, archive.entries.count) }
  end

  # Errors raised inside the block are wrapped in Danbooru::Archive::Error.
  should "raise an error if the block raises an error" do
    assert_raises(Danbooru::Archive::Error) do
      Danbooru::Archive.open!("test/files/ugoira.zip") { raise "failed" }
    end
  end

  should "raise an error if the file doesn't exist" do
    assert_raises(Danbooru::Archive::Error) do
      Danbooru::Archive.open!("test/files/does_not_exist.zip")
    end
  end
end
context ".open method" do
  # The ugoira.zip fixture contains 5 frames.
  should "work without a block" do
    entry_count = Danbooru::Archive.open("test/files/ugoira.zip").entries.count
    assert_equal(5, entry_count)
  end

  should "work with a block" do
    Danbooru::Archive.open("test/files/ugoira.zip") { |archive| assert_equal(5, archive.entries.count) }
  end

  # Unlike `open!`, failures are reported as nil instead of an exception.
  should "return nil if the block raises an error" do
    result = Danbooru::Archive.open("test/files/ugoira.zip") { raise "failed" }
    assert_nil(result)
  end

  should "return nil if the file doesn't exist" do
    result = Danbooru::Archive.open("test/files/does_not_exist.zip")
    assert_nil(result)
  end
end
context ".extract! method" do
  should "extract to temp directory if not given a block or directory" do
    dir, filenames = Danbooru::Archive.extract!("test/files/ugoira.zip")

    assert_equal(true, File.directory?(dir))
    assert_equal(5, filenames.size)
    filenames.each { |filename| assert_equal(true, File.exist?(filename)) }
  ensure
    # `dir` is nil if extract! itself raised; rm_rf(nil) would raise a TypeError and hide the real failure.
    FileUtils.rm_rf(dir) if dir
  end

  should "extract to a temp directory and delete it afterwards if given a block" do
    Danbooru::Archive.extract!("test/files/ugoira.zip") do |dir, filenames|
      # Remember the directory so we can check it was deleted once the block returns.
      @tmpdir = dir

      assert_equal(true, File.directory?(dir))
      assert_equal(5, filenames.size)
      filenames.each { |filename| assert_equal(true, File.exist?(filename)) }
    end

    assert_equal(true, @tmpdir.present?)
    assert_equal(false, File.exist?(@tmpdir))
  end

  should "extract to given directory if given a directory" do
    Dir.mktmpdir do |tmpdir|
      dir, filenames = Danbooru::Archive.extract!("test/files/ugoira.zip", tmpdir)

      assert_equal(dir, tmpdir)
      assert_equal(5, filenames.size)
      filenames.each { |filename| assert_equal(true, File.exist?(filename)) }
    end
  end
end
context "#uncompressed_size method" do
  # 6161 bytes is the combined size of the frames in the ugoira.zip fixture.
  should "work" do
    total = Danbooru::Archive.open!("test/files/ugoira.zip").uncompressed_size
    assert_equal(6161, total)
  end
end
context "#exists? method" do
  # The block receives each entry and its 1-based position; the fixture has 5 entries, so position 5 matches.
  should "work" do
    archive = Danbooru::Archive.open!("test/files/ugoira.zip")
    found = archive.exists? { |_entry, count| count > 4 }

    assert_equal(true, found)
  end
end
context "#format method" do
  # The format string is whatever libarchive reports for the fixture's first entry.
  should "work" do
    format_name = Danbooru::Archive.open!("test/files/ugoira.zip").format
    assert_equal("ZIP 2.0 (uncompressed)", format_name)
  end
end
context "#ls method" do
  # Capture the listing in memory and check the line for the first frame.
  should "work" do
    listing = StringIO.new
    Danbooru::Archive.open!("test/files/ugoira.zip").ls(listing)

    assert_match(/^-rw-rw-r-- *0 0 *1639 2014-10-05 23:31:06 000000\.jpg$/, listing.string)
  end
end
end
end