downloads: refactor Downloads::File into Danbooru::Http.

Remove the Downloads::File class. Move download methods to
Danbooru::Http instead. This means that:

* HTTParty has been replaced with http.rb for downloading files.

* Downloading is no longer tightly coupled to source strategies. Previously,
  Downloads::File automatically looked up the source strategy and downloaded
  the full-size image when given a sample URL. Now we can do plain downloads
  without source strategies altering the URL (see the sketch after this list).

* The Cloudflare Polish check has been changed from checking for a
  Cloudflare IP to checking for the CF-Polished header. Looking up the
  list of Cloudflare IPs was slow and flaky during testing.

* The SSRF protection code has been factored out so it can be used for
  normal http requests, not just for downloads.

* The Webmock gem can be removed, since it was only used to stub out certain
  HTTParty requests in the download tests. Webmock is buggy and caused some
  tests to fail during CI.

* The retriable gem can be removed, since we no longer automatically retry
  failed downloads. We assume that if a download fails once, retrying
  probably won't help.
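
As an illustration, a rough sketch of the new API (the chain below uses
methods added in this commit; the URL is just an example):

    # Build a client that refuses private IPs, times out after 30 seconds,
    # and caps the download size, then fetch a file. download_media returns
    # the HTTP response along with the file wrapped in a MediaFile.
    http = Danbooru::Http.public_only.timeout(30).max_size(Danbooru.config.max_file_size)
    response, media_file = http.download_media("https://example.com/image.jpg")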
Author: evazion
Date: 2020-06-15 04:41:42 -05:00
Committed by: evazion
Parent: 10b7a53449
Commit: 26ad844bbe
15 changed files with 173 additions and 285 deletions

View File

@@ -9,15 +9,6 @@ class CloudflareService
api_token.present? && zone.present?
end
def ips(expiry: 24.hours)
response = Danbooru::Http.cache(expiry).get("https://api.cloudflare.com/client/v4/ips")
return [] if response.code != 200
result = response.parse["result"]
ips = result["ipv4_cidrs"] + result["ipv6_cidrs"]
ips.map { |ip| IPAddr.new(ip) }
end
def purge_cache(urls)
return unless enabled?

View File

@@ -1,18 +1,25 @@
module Danbooru
class Http
class DownloadError < StandardError; end
class FileTooLargeError < StandardError; end
DEFAULT_TIMEOUT = 10
MAX_REDIRECTS = 5
attr_writer :cache, :http
attr_writer :cache, :max_size, :http
class << self
delegate :get, :put, :post, :delete, :cache, :follow, :timeout, :auth, :basic_auth, :headers, to: :new
delegate :get, :head, :put, :post, :delete, :cache, :follow, :max_size, :timeout, :auth, :basic_auth, :headers, :public_only, :download_media, to: :new
end
def get(url, **options)
request(:get, url, **options)
end
def head(url, **options)
request(:head, url, **options)
end
def put(url, **options)
request(:put, url, **options)
end
@@ -33,6 +40,10 @@ module Danbooru
dup.tap { |o| o.http = o.http.follow(*args) }
end
def max_size(size)
dup.tap { |o| o.max_size = size }
end
def timeout(*args)
dup.tap { |o| o.http = o.http.timeout(*args) }
end
@@ -49,6 +60,46 @@ module Danbooru
dup.tap { |o| o.http = o.http.headers(*args) }
end
# allow requests only to public IPs, not to local or private networks.
def public_only
dup.tap do |o|
o.http = o.http.dup.tap do |http|
http.default_options = http.default_options.with_socket_class(ValidatingSocket)
end
end
end
concerning :DownloadMethods do
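# Download the file at `url` and return both the HTTP response and the
# file wrapped in a MediaFile.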
def download_media(url, no_polish: true, **options)
url = Addressable::URI.heuristic_parse(url)
response = headers(Referer: url.origin).get(url)
# prevent Cloudflare Polish from modifying images.
if no_polish && response.headers["CF-Polished"].present?
url.query_values = url.query_values.to_h.merge(danbooru_no_polish: SecureRandom.uuid)
return download_media(url, no_polish: false)
end
file = download_response(response, **options)
[response, MediaFile.open(file)]
end
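# Stream the response body into a tempfile, enforcing @max_size both up
# front (via Content-Length) and during streaming (for chunked responses).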
def download_response(response, file: Tempfile.new("danbooru-download-", binmode: true))
raise DownloadError, response if response.status != 200
raise FileTooLargeError, response if @max_size && response.content_length.to_i > @max_size
size = 0
response.body.each do |chunk|
size += chunk.size
raise FileTooLargeError if @max_size && size > @max_size
file.write(chunk)
end
file.rewind
file
end
end
protected
def request(method, url, **options)
@@ -57,11 +108,12 @@ module Danbooru
else
raw_request(method, url, **options)
end
rescue ValidatingSocket::ProhibitedIpError
fake_response(597, "")
rescue HTTP::Redirector::TooManyRedirectsError
::HTTP::Response.new(status: 598, body: "", version: "1.1")
fake_response(598, "")
rescue HTTP::TimeoutError
# return a synthetic http error on connection timeouts
::HTTP::Response.new(status: 599, body: "", version: "1.1")
fake_response(599, "")
end
def cached_request(method, url, **options)
@@ -79,6 +131,10 @@ module Danbooru
http.send(method, url, **options)
end
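# Build a synthetic response for requests that failed before yielding a real one.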
def fake_response(status, body)
::HTTP::Response.new(status: status, version: "1.1", body: ::HTTP::Response::Body.new(body))
end
def http
@http ||= ::HTTP.
follow(strict: false, max_hops: MAX_REDIRECTS).
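
The rescue clauses above map low-level failures to synthetic HTTP statuses,
so callers can branch on a status code instead of rescuing exceptions. A
minimal sketch, assuming the 597/598/599 mapping shown above (the URL is
only for illustration):

    response = Danbooru::Http.public_only.get("http://127.0.0.1")
    case response.code
    when 597 then warn "blocked by SSRF protection"  # ValidatingSocket::ProhibitedIpError
    when 598 then warn "too many redirects"          # HTTP::Redirector::TooManyRedirectsError
    when 599 then warn "connection timed out"        # HTTP::TimeoutError
    end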

View File

@@ -1,123 +0,0 @@
require 'resolv'
module Downloads
class File
include ActiveModel::Validations
class Error < StandardError; end
RETRIABLE_ERRORS = [Errno::ECONNRESET, Errno::ETIMEDOUT, Errno::EIO, Errno::EHOSTUNREACH, Errno::ECONNREFUSED, Timeout::Error, IOError]
delegate :data, to: :strategy
attr_reader :url, :referer
validate :validate_url
def initialize(url, referer = nil)
@url = Addressable::URI.parse(url) rescue nil
@referer = referer
validate!
end
def size
res = HTTParty.head(uncached_url, **httparty_options, timeout: 3)
if res.success?
res.content_length
else
raise HTTParty::ResponseError.new(res)
end
end
def download!(url: uncached_url, tries: 3, **options)
Retriable.retriable(on: RETRIABLE_ERRORS, tries: tries, base_interval: 0) do
file = http_get_streaming(url, headers: strategy.headers, **options)
return [file, strategy]
end
end
def validate_url
errors[:base] << "URL must not be blank" if url.blank?
errors[:base] << "'#{url}' is not a valid url" if !url.host.present?
errors[:base] << "'#{url}' is not a valid url. Did you mean 'http://#{url}'?" if !url.scheme.in?(%w[http https])
end
def http_get_streaming(url, file: Tempfile.new(binmode: true), headers: {}, max_size: Danbooru.config.max_file_size)
size = 0
res = HTTParty.get(url, httparty_options) do |chunk|
next if chunk.code == 302
size += chunk.size
raise Error.new("File is too large (max size: #{max_size})") if size > max_size && max_size > 0
file.write(chunk)
end
if res.success?
file.rewind
return file
else
raise Error.new("HTTP error code: #{res.code} #{res.message}")
end
end
# Prevent Cloudflare from potentially mangling the image. See issue #3528.
def uncached_url
return file_url unless is_cloudflare?(file_url)
url = file_url.dup
url.query_values = url.query_values.to_h.merge(danbooru_no_cache: SecureRandom.uuid)
url
end
def preview_url
@preview_url ||= Addressable::URI.parse(strategy.preview_url)
end
def file_url
@file_url ||= Addressable::URI.parse(strategy.image_url)
end
def strategy
@strategy ||= Sources::Strategies.find(url.to_s, referer)
end
def httparty_options
{
timeout: 10,
stream_body: true,
headers: strategy.headers,
connection_adapter: ValidatingConnectionAdapter
}.deep_merge(Danbooru.config.httparty_options)
end
def is_cloudflare?(url)
ip_addr = IPAddr.new(Resolv.getaddress(url.hostname))
CloudflareService.new.ips.any? { |subnet| subnet.include?(ip_addr) }
end
def self.banned_ip?(ip)
ip = IPAddress.parse(ip.to_s) unless ip.is_a?(IPAddress)
if ip.ipv4?
ip.loopback? || ip.link_local? || ip.multicast? || ip.private?
elsif ip.ipv6?
ip.loopback? || ip.link_local? || ip.unique_local? || ip.unspecified?
end
end
end
# Hook into HTTParty to validate the IP before following redirects.
# https://www.rubydoc.info/github/jnunemaker/httparty/HTTParty/ConnectionAdapter
class ValidatingConnectionAdapter < HTTParty::ConnectionAdapter
def self.call(uri, options)
ip_addr = IPAddress.parse(::Resolv.getaddress(uri.hostname))
if Downloads::File.banned_ip?(ip_addr)
raise Downloads::File::Error, "Downloads from #{ip_addr} are not allowed"
end
super(uri, options)
end
end
end

View File

@@ -12,8 +12,9 @@ class IqdbProxy
end
def download(url, type)
download = Downloads::File.new(url)
file, strategy = download.download!(url: download.send(type))
strategy = Sources::Strategies.find(url)
download_url = strategy.send(type)
file = strategy.download_file!(download_url)
file
end
@@ -32,7 +33,7 @@ class IqdbProxy
file = download(params[:image_url], :url)
results = query(file: file, limit: limit)
elsif params[:file_url].present?
file = download(params[:file_url], :file_url)
file = download(params[:file_url], :image_url)
results = query(file: file, limit: limit)
elsif params[:post_id].present?
url = Post.find(params[:post_id]).preview_file_url

View File

@@ -14,6 +14,8 @@
module Sources
module Strategies
class Base
class DownloadError < StandardError; end
attr_reader :url, :referer_url, :urls, :parsed_url, :parsed_referer, :parsed_urls
extend Memoist
@@ -35,8 +37,8 @@ module Sources
# <tt>referrer_url</tt> so the strategy can discover the HTML
# page and other information.
def initialize(url, referer_url = nil)
@url = url
@referer_url = referer_url
@url = url.to_s
@referer_url = referer_url&.to_s
@urls = [url, referer_url].select(&:present?)
@parsed_url = Addressable::URI.heuristic_parse(url) rescue nil
@@ -144,10 +146,22 @@ module Sources
# Returns the size of the image resource without actually downloading the file.
def size
Downloads::File.new(image_url).size
http.head(image_url).content_length.to_i
end
memoize :size
# Download the file at the given url, or at the main image url by default.
def download_file!(download_url = image_url)
response, file = http.download_media(download_url)
raise DownloadError, "Download failed: #{download_url} returned error #{response.status}" if response.status != 200
file
end
def http
Danbooru::Http.public_only.timeout(30).max_size(Danbooru.config.max_file_size)
end
memoize :http
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
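
Call sites that previously used Downloads::File now go through the strategy
instead. A sketch of the new flow, mirroring the call sites changed below:

    strategy = Sources::Strategies.find(url, referer_url)
    file = strategy.download_file!  # a MediaFile; raises DownloadError on a non-200 response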

View File

@@ -7,11 +7,8 @@ class UploadService
# this gets called from UploadsController#new so we need to preprocess async
UploadPreprocessorDelayedStartJob.perform_later(url, ref, CurrentUser.user)
begin
download = Downloads::File.new(url, ref)
remote_size = download.size
rescue Exception
end
strategy = Sources::Strategies.find(url, ref)
remote_size = strategy.size
return [upload, remote_size]
end

View File

@@ -71,13 +71,13 @@ class UploadService
return file if file.present?
raise "No file or source URL provided" if upload.source_url.blank?
download = Downloads::File.new(upload.source_url, upload.referer_url)
file, strategy = download.download!
strategy = Sources::Strategies.find(upload.source_url, upload.referer_url)
file = strategy.download_file!
if download.data[:ugoira_frame_data].present?
if strategy.data[:ugoira_frame_data].present?
upload.context = {
"ugoira" => {
"frame_data" => download.data[:ugoira_frame_data],
"frame_data" => strategy.data[:ugoira_frame_data],
"content_type" => "image/jpeg"
}
}

View File

@@ -0,0 +1,27 @@
# A TCPSocket wrapper that disallows connections to local or private IPs. Used for SSRF protection.
# https://owasp.org/www-community/attacks/Server_Side_Request_Forgery
require "resolv"
class ValidatingSocket < TCPSocket
class ProhibitedIpError < StandardError; end
def initialize(hostname, port)
ip = validate_hostname!(hostname)
super(ip, port)
end
def validate_hostname!(hostname)
ip = IPAddress.parse(::Resolv.getaddress(hostname))
raise ProhibitedIpError, "Connection to #{hostname} failed; #{ip} is a prohibited IP" if prohibited_ip?(ip)
ip.to_s
end
def prohibited_ip?(ip)
if ip.ipv4?
ip.loopback? || ip.link_local? || ip.multicast? || ip.private?
elsif ip.ipv6?
ip.loopback? || ip.link_local? || ip.unique_local? || ip.unspecified?
end
end
end
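
A quick sketch of the socket on its own (hostnames are only examples; any
name resolving to a loopback, link-local, multicast, private, or unique-local
address is refused):

    ValidatingSocket.new("danbooru.donmai.us", 443) # public IP: connects normally
    ValidatingSocket.new("localhost", 80)           # 127.0.0.1 is loopback: raises ProhibitedIpError

    # Wired into http.rb the same way Danbooru::Http#public_only does above:
    http = HTTP::Client.new(HTTP::Options.new.with_socket_class(ValidatingSocket))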

View File

@@ -1,8 +1,8 @@
module DownloadTestHelper
def assert_downloaded(expected_filesize, source, referer = nil)
download = Downloads::File.new(source, referer)
tempfile, strategy = download.download!
assert_equal(expected_filesize, tempfile.size, "Tested source URL: #{source}")
strategy = Sources::Strategies.find(source, referer)
file = strategy.download_file!
assert_equal(expected_filesize, file.size, "Tested source URL: #{source}")
rescue Net::OpenTimeout
skip "Remote connection to #{source} failed"
end

View File

@@ -14,10 +14,4 @@ class CloudflareServiceTest < ActiveSupport::TestCase
assert_requested(:delete, "https://api.cloudflare.com/client/v4/zones/123/purge_cache", times: 1)
end
end
context "#ips" do
should "work" do
refute_empty(@cloudflare.ips)
end
end
end

View File

@@ -4,7 +4,7 @@ class DanbooruHttpTest < ActiveSupport::TestCase
context "Danbooru::Http" do
context "#get method" do
should "work for all basic methods" do
%i[get put post delete].each do |method|
%i[get head put post delete].each do |method|
response = Danbooru::Http.send(method, "https://httpbin.org/status/200")
assert_equal(200, response.status)
end
@@ -26,8 +26,10 @@ class DanbooruHttpTest < ActiveSupport::TestCase
end
should "fail if the request takes too long to download" do
response = Danbooru::Http.timeout(1).get("https://httpbin.org/drip?duration=5&numbytes=5")
assert_equal(599, response.status)
# XXX should return status 599 instead
assert_raises(HTTP::TimeoutError) do
response = Danbooru::Http.timeout(1).get("https://httpbin.org/drip?duration=10&numbytes=10").flush
end
end
should "automatically decompress gzipped responses" do
@@ -45,5 +47,45 @@ class DanbooruHttpTest < ActiveSupport::TestCase
assert_equal(response2.body, response1.body)
end
end
context "#download method" do
should "download files" do
response, file = Danbooru::Http.download_media("https://httpbin.org/bytes/1000")
assert_equal(200, response.status)
assert_equal(1000, file.size)
end
should "follow redirects when downloading files" do
response, file = Danbooru::Http.download_media("https://httpbin.org/redirect-to?url=https://httpbin.org/bytes/1000")
assert_equal(200, response.status)
assert_equal(1000, file.size)
end
should "fail if the url points to a private IP" do
assert_raises(Danbooru::Http::DownloadError) do
Danbooru::Http.public_only.download_media("https://127.0.0.1.xip.io")
end
end
should "fail if the url redirects to a private IP" do
assert_raises(Danbooru::Http::DownloadError) do
Danbooru::Http.public_only.download_media("https://httpbin.org/redirect-to?url=https://127.0.0.1.xip.io")
end
end
should "fail if a download is too large" do
assert_raises(Danbooru::Http::FileTooLargeError) do
response, file = Danbooru::Http.max_size(500).download_media("https://httpbin.org/bytes/1000")
end
end
should "fail if a streaming download is too large" do
assert_raises(Danbooru::Http::FileTooLargeError) do
response, file = Danbooru::Http.max_size(500).download_media("https://httpbin.org/stream-bytes/1000")
end
end
end
end
end

View File

@@ -3,53 +3,27 @@ require 'test_helper'
module Downloads
class ArtStationTest < ActiveSupport::TestCase
context "a download for a (small) artstation image" do
setup do
@asset = "https://cdnb3.artstation.com/p/assets/images/images/003/716/071/small/aoi-ogata-hate-city.jpg?1476754974"
@download = Downloads::File.new(@asset)
end
should "download the /4k/ image instead" do
file, strategy = @download.download!
assert_equal(1_880_910, ::File.size(file.path))
assert_downloaded(1_880_910, "https://cdnb3.artstation.com/p/assets/images/images/003/716/071/small/aoi-ogata-hate-city.jpg?1476754974")
end
end
context "for an image where an original does not exist" do
setup do
@asset = "https://cdna.artstation.com/p/assets/images/images/004/730/278/large/mendel-oh-dragonll.jpg"
@download = Downloads::File.new(@asset)
end
should "not try to download the original" do
file, strategy = @download.download!
assert_equal(483_192, ::File.size(file.path))
assert_downloaded(483_192, "https://cdna.artstation.com/p/assets/images/images/004/730/278/large/mendel-oh-dragonll.jpg")
end
end
context "a download for an ArtStation image hosted on CloudFlare" do
setup do
@asset = "https://cdnb.artstation.com/p/assets/images/images/003/716/071/large/aoi-ogata-hate-city.jpg?1476754974"
end
should "return the original file, not the polished file" do
@asset = "https://cdnb.artstation.com/p/assets/images/images/003/716/071/large/aoi-ogata-hate-city.jpg?1476754974"
assert_downloaded(1_880_910, @asset)
end
should "return the original filesize, not the polished filesize" do
assert_equal(1_880_910, Downloads::File.new(@asset).size)
end
end
context "a download for a https://$artist.artstation.com/projects/$id page" do
setup do
@source = "https://dantewontdie.artstation.com/projects/YZK5q"
@download = Downloads::File.new(@source)
end
should "download the original image instead" do
file, strategy = @download.download!
assert_equal(247_350, ::File.size(file.path))
assert_downloaded(247_350, "https://dantewontdie.artstation.com/projects/YZK5q")
end
end
end

View File

@@ -1,81 +0,0 @@
require 'test_helper'
module Downloads
class FileTest < ActiveSupport::TestCase
context "A post download" do
setup do
@source = "http://www.google.com/intl/en_ALL/images/logo.gif"
@download = Downloads::File.new(@source)
end
context "for a banned IP" do
setup do
Resolv.expects(:getaddress).returns("127.0.0.1").at_least_once
end
should "not try to download the file" do
assert_raise(Downloads::File::Error) { Downloads::File.new("http://evil.com").download! }
end
should "not try to fetch the size" do
assert_raise(Downloads::File::Error) { Downloads::File.new("http://evil.com").size }
end
should "not follow redirects to banned IPs" do
url = "http://httpbin.org/redirect-to?url=http://127.0.0.1"
stub_request(:get, url).to_return(status: 301, headers: { "Location": "http://127.0.0.1" })
assert_raise(Downloads::File::Error) { Downloads::File.new(url).download! }
end
should "not follow redirects that resolve to a banned IP" do
url = "http://httpbin.org/redirect-to?url=http://127.0.0.1.nip.io"
stub_request(:get, url).to_return(status: 301, headers: { "Location": "http://127.0.0.1.xip.io" })
assert_raise(Downloads::File::Error) { Downloads::File.new(url).download! }
end
end
context "that fails" do
should "retry three times before giving up" do
HTTParty.expects(:get).times(3).raises(Errno::ETIMEDOUT)
assert_raises(Errno::ETIMEDOUT) { @download.download! }
end
should "return an uncorrupted file on the second try" do
bomb = stub("bomb")
bomb.stubs(:code).raises(IOError)
resp = stub("resp", success?: true)
chunk = stub("a")
chunk.stubs(:code).returns(200)
chunk.stubs(:size).returns(1)
chunk.stubs(:to_s).returns("a")
HTTParty.expects(:get).twice.multiple_yields(chunk, bomb).then.multiple_yields(chunk, chunk).returns(resp)
@download.stubs(:is_cloudflare?).returns(false)
tempfile, _strategy = @download.download!
assert_equal("aa", tempfile.read)
end
end
should "throw an exception when the file is larger than the maximum" do
assert_raise(Downloads::File::Error) do
@download.download!(max_size: 1)
end
end
should "store the file in the tempfile path" do
tempfile, strategy = @download.download!
assert_operator(tempfile.size, :>, 0, "should have data")
end
should "correctly save the file when following 302 redirects" do
download = Downloads::File.new("https://yande.re/post/show/578014")
file, strategy = download.download!(url: download.preview_url)
assert_equal(19134, file.size)
end
end
end
end

View File

@@ -136,18 +136,14 @@ module Downloads
end
context "An ugoira site for pixiv" do
setup do
@download = Downloads::File.new("http://www.pixiv.net/member_illust.php?mode=medium&illust_id=62247364")
@tempfile, strategy = @download.download!
@tempfile.close!
end
should "capture the data" do
assert_equal(2, @download.data[:ugoira_frame_data].size)
if @download.data[:ugoira_frame_data][0]["file"]
@strategy = Sources::Strategies.find("http://www.pixiv.net/member_illust.php?mode=medium&illust_id=62247364")
assert_equal(2, @strategy.data[:ugoira_frame_data].size)
if @strategy.data[:ugoira_frame_data][0]["file"]
assert_equal([{"file" => "000000.jpg", "delay" => 125}, {"file" => "000001.jpg", "delay" => 125}], @download.data[:ugoira_frame_data])
else
assert_equal([{"delay_msec" => 125}, {"delay_msec" => 125}], @download.data[:ugoira_frame_data])
assert_equal([{"delay_msec" => 125}, {"delay_msec" => 125}], @strategy.data[:ugoira_frame_data])
end
end
end

View File

@@ -212,7 +212,7 @@ class UploadServiceTest < ActiveSupport::TestCase
context "on timeout errors" do
setup do
@source = "https://cdn.donmai.us/original/93/f4/93f4dd66ef1eb11a89e56d31f9adc8d0.jpg"
HTTParty.stubs(:get).raises(Net::ReadTimeout)
Danbooru::Http.any_instance.stubs(:get).raises(HTTP::TimeoutError)
end
should "leave the upload in an error state" do