Refactor sources

This commit is contained in:
Albert Yi
2018-08-06 17:39:25 -07:00
parent 54363ffecf
commit 762dc3da24
71 changed files with 2340 additions and 2430 deletions

View File

@@ -3,16 +3,33 @@ module Downloads
# Raised when a download fails (for example on an unsuccessful HTTP response).
# Derives from StandardError rather than Exception so that an ordinary
# `rescue` / `rescue => e` clause is able to handle it.
class Error < StandardError; end
attr_reader :data, :options
attr_accessor :source, :original_source, :downloaded_source
attr_accessor :source, :referer
def initialize(source, options = {})
# Parses `url` into an Addressable::URI and returns it. When is_cloudflare?
# reports true for that URI, a `danbooru_no_cache` query parameter carrying a
# fresh UUID is merged into the existing query values first, to prevent
# Cloudflare from potentially mangling the image. See issue #3528.
def self.uncached_url(url, headers = {})
  uri = Addressable::URI.parse(url)
  return uri unless is_cloudflare?(uri, headers)

  current_params = uri.query_values || {}
  uri.query_values = current_params.merge(danbooru_no_cache: SecureRandom.uuid)
  uri
end
# Reports whether a HEAD request to `url` comes back with a "CF-Ray" header.
# The lookup goes through Cache.get under a key built from `url.origin`
# (4.hours is passed alongside it, presumably as the cache lifetime), and the
# request is made inside the block handed to Cache.get.
# Raises Error when the HEAD response is not successful.
def self.is_cloudflare?(url, headers = {})
  cache_key = "is_cloudflare:#{url.origin}"

  Cache.get(cache_key, 4.hours) do
    request_options = { headers: headers }.deep_merge(Danbooru.config.httparty_options)
    response = HTTParty.head(url, request_options)

    unless response.success?
      raise Error, "HTTP error code: #{response.code} #{response.message}"
    end

    response.key?("CF-Ray")
  end
end
def initialize(source, referer=nil, options = {})
# source can potentially get rewritten in the course
# of downloading a file, so check it again
@source = source
@original_source = source
# the URL actually downloaded after rewriting the original source.
@downloaded_source = nil
@referer = referer
# we sometimes need to capture data from the source page
@data = {}
@@ -22,48 +39,31 @@ module Downloads
@data[:get_thumbnail] = options[:get_thumbnail]
end
# Returns the first element of what before_download produces for @source and
# @data, i.e. the rewritten URL; the headers and data are discarded.
def rewrite_url
  rewritten, = before_download(@source, @data)
  rewritten
end
def size
url, headers, _ = before_download(@source, @data)
options = { timeout: 3, headers: headers }.deep_merge(Danbooru.config.httparty_options)
res = HTTParty.head(url, options)
res.content_length
strategy = Sources::Strategies.find(source, referer)
options = { timeout: 3, headers: strategy.headers }.deep_merge(Danbooru.config.httparty_options)
res = HTTParty.head(strategy.file_url, options)
if res.success?
res.content_length
else
raise HTTParty::ResponseError.new(res)
end
end
def download!
url, headers, @data = before_download(@source, @data)
strategy = Sources::Strategies.find(source, referer)
output_file = Tempfile.new(binmode: true)
http_get_streaming(uncached_url(url, headers), output_file, headers)
@data = strategy.data
@downloaded_source = url
@source = after_download(url)
http_get_streaming(
self.class.uncached_url(strategy.file_url, strategy.headers),
output_file,
strategy.headers
)
output_file
end
# Threads `url`, the configured HTTP headers and `datums` through every class
# listed by RewriteStrategies::Base.strategies, in order. Each strategy is
# instantiated with the current URL and its #rewrite result feeds the next
# strategy. Whenever a strategy yields a nil URL, the URL originally passed to
# this method is substituted. Returns [url, headers, datums].
def before_download(url, datums)
  fallback_url = url
  initial_state = [url, Danbooru.config.http_headers, datums]

  RewriteStrategies::Base.strategies.inject(initial_state) do |(current_url, current_headers, current_data), strategy_class|
    next_url, next_headers, next_data =
      strategy_class.new(current_url).rewrite(current_url, current_headers, current_data)

    [next_url.nil? ? fallback_url : next_url, next_headers, next_data]
  end
end
def after_download(src)
src = fix_twitter_sources(src)
if options[:referer_url].present?
src = set_source_to_referer(src, options[:referer_url])
end
src
[output_file, strategy]
end
def validate_local_hosts(url)
@@ -111,50 +111,5 @@ module Downloads
end
end # while
end # def
# Returns original_source in place of `src` when `src` is a twimg.com
# (video/pbs) file and original_source is a twitter.com URL, or when `src` is
# an img.pawoo.net file and original_source is a pawoo.net URL. In every other
# case `src` is returned untouched.
def fix_twitter_sources(src)
  twimg_file = src =~ %r!^https?://(?:video|pbs)\.twimg\.com/!
  return original_source if twimg_file && original_source =~ %r!^https?://twitter\.com/!

  pawoo_file = src =~ %r!^https?://img\.pawoo\.net/!
  return original_source if pawoo_file && original_source =~ %r!^https?://pawoo\.net/!

  src
end
# Returns the referer_url reported by Sources::Site for (src, referer) when
# `src` is matched by the Nijie, Twitter, Pawoo, Tumblr or ArtStation source
# strategy, or when `referer` is matched by the Twitter, Tumblr or ArtStation
# strategy. Otherwise `src` is returned unchanged.
def set_source_to_referer(src, referer)
  strategies = Sources::Strategies

  referer_applies =
    strategies::Nijie.url_match?(src) ||
    strategies::Twitter.url_match?(src) || strategies::Twitter.url_match?(referer) ||
    strategies::Pawoo.url_match?(src) ||
    strategies::Tumblr.url_match?(src) || strategies::Tumblr.url_match?(referer) ||
    strategies::ArtStation.url_match?(src) || strategies::ArtStation.url_match?(referer)
  return src unless referer_applies

  Sources::Site.new(src, :referer_url => referer).referer_url
end
private
# Prevent Cloudflare from potentially mangling the image. See issue #3528.
#
# The given `url` is parsed into an Addressable::URI; if is_cloudflare? says
# yes for it, a `danbooru_no_cache` parameter with a fresh UUID is merged into
# its query values. The (possibly amended) URI object is returned.
def uncached_url(url, headers = {})
  parsed = Addressable::URI.parse(url)

  if is_cloudflare?(parsed, headers)
    query = parsed.query_values || {}
    parsed.query_values = query.merge(danbooru_no_cache: SecureRandom.uuid)
  end

  parsed
end
# True when the HEAD response for `url` carries a "CF-Ray" header. The check
# runs inside the block given to Cache.get, keyed by `url.origin` (with
# 4.hours passed alongside, presumably the cache lifetime).
# Raises Error if the HEAD response is not successful.
def is_cloudflare?(url, headers = {})
  Cache.get("is_cloudflare:#{url.origin}", 4.hours) do
    head_options = { headers: headers }.deep_merge(Danbooru.config.httparty_options)
    head_response = HTTParty.head(url, head_options)

    if head_response.success?
      head_response.key?("CF-Ray")
    else
      raise Error, "HTTP error code: #{head_response.code} #{head_response.message}"
    end
  end
end
end
end

View File

@@ -1,33 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for ArtStation URLs.
    class ArtStation < Base
      # Returns [url, headers, data].
      #
      # A CDN URL for a medium/small/large rendition is swapped for the
      # corresponding "original" URL, but only when a HEAD request confirms
      # that URL exists. Any other URL is passed to rewrite_html_url.
      def rewrite(url, headers, data = {})
        # example: https://cdnb3.artstation.com/p/assets/images/images/003/716/071/large/aoi-ogata-hate-city.jpg?1476754974
        unless url =~ %r!^https?://cdn\w*\.artstation\.com/p/assets/images/images/\d+/\d+/\d+/(?:medium|small|large)/!
          page_image_url, headers = rewrite_html_url(url, headers)
          return [page_image_url, headers, data]
        end

        candidate, headers = rewrite_large_url(url, headers)
        url = candidate if http_exists?(candidate, headers)
        [url, headers, data]
      end

      protected

      # For URLs matched by Sources::Strategies::ArtStation, fetches the
      # source via Sources::Site and returns its image_url; other URLs are
      # returned as given.
      def rewrite_html_url(url, headers)
        return [url, headers] unless Sources::Strategies::ArtStation.url_match?(url)

        site = Sources::Site.new(url)
        site.get
        [site.image_url, headers]
      end

      # Replaces the first /medium/, /small/ or /large/ path segment with
      # /original/.
      def rewrite_large_url(url, headers)
        # example: https://cdnb3.artstation.com/p/assets/images/images/003/716/071/original/aoi-ogata-hate-city.jpg?1476754974
        [url.sub(%r!/(?:medium|small|large)/!, "/original/"), headers]
      end
    end
  end
end

View File

@@ -1,29 +0,0 @@
# This is a collection of strategies for normalizing URLs. Most strategies
# typically work by parsing and rewriting the URL itself, but some strategies
# may delegate to Sources::Strategies to obtain a more canonical URL.
module Downloads
  module RewriteStrategies
    # Common parent of the rewrite strategies. Its #rewrite is a no-op.
    class Base
      attr_reader :url

      # The strategy classes, in the order they are listed here.
      def self.strategies
        [
          Downloads::RewriteStrategies::Pixiv,
          Downloads::RewriteStrategies::NicoSeiga,
          Downloads::RewriteStrategies::ArtStation,
          Downloads::RewriteStrategies::Twitpic,
          Downloads::RewriteStrategies::DeviantArt,
          Downloads::RewriteStrategies::Tumblr,
          Downloads::RewriteStrategies::Moebooru,
          Downloads::RewriteStrategies::Twitter,
          Downloads::RewriteStrategies::Nijie,
          Downloads::RewriteStrategies::Pawoo
        ]
      end

      def initialize(url = nil)
        @url = url
      end

      # Hands back its three arguments unchanged as [url, headers, data].
      def rewrite(url, headers, data = {})
        [url, headers, data]
      end

      protected

      # Issues a HEAD request for `url` (configured HTTParty options merged
      # with the given headers) and reports whether it succeeded.
      def http_exists?(url, headers)
        request_options = Danbooru.config.httparty_options.deep_merge(headers: headers)
        HTTParty.head(url, request_options).success?
      end
    end
  end
end

View File

@@ -1,53 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for DeviantArt page URLs and deviantart.net file URLs.
    class DeviantArt < Base
      attr_accessor :url, :source

      def initialize(url)
        @url = url
      end

      # Returns [url, headers, data]. URLs that look like a DeviantArt art
      # page or a deviantart.net file are run through rewrite_html_pages and
      # then rewrite_thumbnails; everything else is returned as given.
      def rewrite(url, headers, data = {})
        if url =~ %r{deviantart\.com/art/} || url =~ %r{\Ahttps?://www\.deviantart\.com/([^/]+)/art/} || url =~ %r{deviantart\.net/.+/[a-z0-9_]+(_by_[a-z0-9_]+)?-d([a-z0-9]+)\.}i
          url, headers = rewrite_html_pages(url, headers)
          url, headers = rewrite_thumbnails(url, headers)
        end

        [url, headers, data]
      end

      protected

      # Art page URLs are replaced by the image_url of the fetched source.
      def rewrite_html_pages(url, headers)
        if url =~ %r{^https?://.+?\.deviantart\.com/art/} || url =~ %r{\Ahttps?://www\.deviantart\.com/([^/]+)/art/}
          [source.image_url, headers]
        else
          [url, headers]
        end
      end

      # Drops the "200H/" or "PRE/" path segment from fc/th thumbnail URLs,
      # and replaces pre/img URLs with the image_url of the fetched source.
      #
      # The rewritten URL is built with String#sub so the string passed in by
      # the caller is left untouched (it may be shared with other references
      # to the original source, or be frozen).
      def rewrite_thumbnails(url, headers)
        if url =~ %r{^(https?://(?:fc|th)\d{2}\.deviantart\.net/.+?/)200H/}
          prefix = $1
          url = url.sub(prefix + "200H/", prefix)
        elsif url =~ %r{^(https?://(?:fc|th)\d{2}\.deviantart\.net/.+?/)PRE/}
          prefix = $1
          url = url.sub(prefix + "PRE/", prefix)
        elsif url =~ %r{^https?://(?:pre|img)\d{2}\.deviantart\.net/}
          return [source.image_url, headers]
        end

        [url, headers]
      end

      # Cache the source data so it gets fetched at most once.
      def source
        @source ||= begin
          source = ::Sources::Strategies::DeviantArt.new(url)
          source.get
          source
        end
      end
    end
  end
end

View File

@@ -1,26 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for yande.re and konachan.com URLs.
    class Moebooru < Base
      DOMAINS = '(?:[^.]+\.)?yande\.re|konachan\.com'

      # Returns [url, headers, data]; URLs on the DOMAINS hosts are passed
      # through rewrite_jpeg_versions first.
      def rewrite(url, headers, data = {})
        url, headers = rewrite_jpeg_versions(url, headers) if url =~ %r{https?://(?:#{DOMAINS})}
        [url, headers, data]
      end

      protected

      # Turns a /jpeg/<hash>....jpg URL into the matching /image/<hash>....png
      # URL; other URLs are returned unchanged.
      def rewrite_jpeg_versions(url, headers)
        # example: https://yande.re/jpeg/2c6876ac2317fce617e3c5f1a642123b/yande.re%20292092%20hatsune_miku%20tid%20vocaloid.jpg
        return [url, headers] unless url =~ %r{\A(https?://(?:#{DOMAINS}))/jpeg/([a-f0-9]+(?:/.*)?)\.jpg\Z}

        site_root, image_path = Regexp.last_match.captures
        ["#{site_root}/image/#{image_path}.png", headers]
      end
    end
  end
end

View File

@@ -1,66 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for lohas.nicoseiga.jp and seiga.nicovideo.jp URLs.
    class NicoSeiga < Base
      attr_accessor :url, :source

      def initialize(url)
        @url = url
      end

      # Returns [url, headers, data]. Matching URLs go through, in order,
      # rewrite_headers, rewrite_html_pages, rewrite_thumbnails and
      # rewrite_view_big_pages; other URLs are returned as given.
      def rewrite(url, headers, data = {})
        nico_url = url =~ %r{https?://lohas\.nicoseiga\.jp} || url =~ %r{https?://seiga\.nicovideo\.jp}
        return [url, headers, data] unless nico_url

        steps = [:rewrite_headers, :rewrite_html_pages, :rewrite_thumbnails, :rewrite_view_big_pages]
        url, headers = steps.inject([url, headers]) do |(current_url, current_headers), step|
          send(step, current_url, current_headers)
        end

        [url, headers, data]
      end

      protected

      # Sets the Referer header on the given headers hash (in place).
      def rewrite_headers(url, headers)
        headers["Referer"] = "http://seiga.nicovideo.jp"
        [url, headers]
      end

      # Illustration pages are replaced by the source's image_url.
      def rewrite_html_pages(url, headers)
        # example: http://seiga.nicovideo.jp/seiga/im1389842
        return [url, headers] unless url =~ %r{https?://seiga\.nicovideo\.jp/seiga/im\d+}

        [source.image_url, headers]
      end

      # URLs containing /thumb/<digits> are replaced by the source's image_url.
      def rewrite_thumbnails(url, headers)
        return [url, headers] unless url =~ %r{/thumb/\d+}

        [source.image_url, headers]
      end

      # lohas.nicoseiga.jp/o/ URLs are replaced by the source's image_url.
      def rewrite_view_big_pages(url, headers)
        # example: http://lohas.nicoseiga.jp/o/40aeedd2848a7780b6046747e75b3566b423a10c/1436307639/5026559
        return [url, headers] unless url =~ %r{http://lohas\.nicoseiga\.jp/o/}

        [source.image_url, headers]
      end

      # Cache the source data so it gets fetched at most once.
      def source
        return @source if @source

        fetched = ::Sources::Strategies::NicoSeiga.new(url)
        fetched.get
        @source = fetched
      end
    end
  end
end

View File

@@ -1,40 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for nijie.info view.php pages.
    class Nijie < Base
      attr_accessor :url, :source

      def initialize(url)
        @url = url
      end

      # Returns [url, headers, data]; view.php?...id=N pages are passed
      # through rewrite_html_pages first.
      def rewrite(url, headers, data = {})
        url, headers = rewrite_html_pages(url, headers) if url =~ %r{https?://nijie\.info\/view\.php.+id=\d+}
        [url, headers, data]
      end

      protected

      # view.php pages are replaced by the image_url of the fetched source.
      def rewrite_html_pages(url, headers)
        # example: http://nijie.info/view.php?id=151126
        return [url, headers] unless url =~ %r{https?://nijie\.info\/view\.php.+id=\d+}

        [source.image_url, headers]
      end

      # Cache the source data so it gets fetched at most once.
      def source
        return @source if @source

        fetched = ::Sources::Strategies::Nijie.new(url)
        fetched.get
        @source = fetched
      end
    end
  end
end

View File

@@ -1,17 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for pawoo.net.
    class Pawoo < Base
      # Returns [url, headers, data].
      #
      # URLs matched by Sources::Strategies::Pawoo are replaced by the
      # image_url of the fetched source. Otherwise an img.pawoo.net "small"
      # attachment URL is rewritten to its "original" counterpart.
      def rewrite(url, headers, data = {})
        if Sources::Strategies::Pawoo.url_match?(url)
          status = Sources::Strategies::Pawoo.new(url)
          status.get
          return [status.image_url, headers, data]
        end

        if url =~ %r!\Ahttps?://img\.pawoo\.net/media_attachments/files/(\d+/\d+/\d+)/small/([a-z0-9]+\.\w+)\z!i
          file_dir, file_name = Regexp.last_match.captures
          url = "https://img.pawoo.net/media_attachments/files/#{file_dir}/original/#{file_name}"
        end

        [url, headers, data]
      end
    end
  end
end

View File

@@ -1,127 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for pixiv.net / i.pximg.net URLs.
    class Pixiv < Base
      attr_accessor :url, :source

      def initialize(url)
        @url = url
      end

      # Returns [url, headers, data].
      #
      # Pixiv URLs first get the Referer header and the CDN host rewrite.
      # If the source reports an illust id for the URL, the page, thumbnail
      # and old manga page rewrites follow, plus the thumbnail downgrade when
      # data contains :get_thumbnail (the key is removed from data). Ugoira
      # zip URLs additionally get the ugoira fields filled in on data.
      #
      # PixivApiClient::BadIDError and Sources::Site::NoStrategyError are
      # rescued; the values reached so far are returned in that case.
      def rewrite(url, headers, data = {})
        if url =~ /\Ahttps?:\/\/(?:\w+\.)?pixiv\.net/ || url =~ /\Ahttps?:\/\/i\.pximg\.net/
          url, headers = rewrite_headers(url, headers)
          url, headers = rewrite_cdn(url, headers)
        end

        if (url =~ /\Ahttps?:\/\/(?:\w+\.)?pixiv\.net/ || url =~ /\Ahttps?:\/\/i\.pximg\.net/) && source.illust_id_from_url
          url, headers = rewrite_html_pages(url, headers)
          url, headers = rewrite_thumbnails(url, headers)
          url, headers = rewrite_old_small_manga_pages(url, headers)
          url, headers = rewrite_to_thumbnails(url, headers) if data.delete(:get_thumbnail)
        end

        # http://i2.pixiv.net/img-zip-ugoira/img/2014/08/05/06/01/10/44524589_ugoira1920x1080.zip
        if url =~ %r!\Ahttps?://(i\d+\.pixiv|i\.pximg)\.net/img-zip-ugoira/img/\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}/\d+_ugoira\d+x\d+\.zip\z!i
          data[:is_ugoira] = true
          data[:ugoira_frame_data] = source.ugoira_frame_data
          data[:ugoira_content_type] = source.ugoira_content_type
        end

        return [url, headers, data]

      rescue PixivApiClient::BadIDError, Sources::Site::NoStrategyError
        return [url, headers, data]
      end

      protected

      # Maps a full-size file URL (ugoira zip, img-original, or old-style
      # imgNN path) to the corresponding small preview URL. URLs matching
      # none of the patterns are returned unchanged.
      def rewrite_to_thumbnails(url, headers)
        if url =~ %r!https?://(i\d+)\.pixiv\.net/img-zip-ugoira/img/(\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})/(\d+)_ugoira\d+x\d+\.zip!
          url = "http://#{$1}.pixiv.net/c/150x150/img-master/img/#{$2}/#{$3}_master1200.jpg"

        elsif url =~ %r!https?://i\.pximg\.net/img-zip-ugoira/img/(\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})/(\d+)_ugoira\d+x\d+\.zip!
          url = "http://i.pximg.net/c/150x150/img-master/img/#{$1}/#{$2}_master1200.jpg"

        elsif url =~ %r!https?://(i\d+)\.pixiv\.net/img-original/img/(\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})/(\d+_p\d+)\.!
          url = "http://#{$1}.pixiv.net/c/150x150/img-master/img/#{$2}/#{$3}_master1200.jpg"

        elsif url =~ %r!https?://i\.pximg\.net/img-original/img/(\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})/(\d+_p\d+)\.!
          url = "http://i.pximg.net/c/150x150/img-master/img/#{$1}/#{$2}_master1200.jpg"

        elsif url =~ %r!https?://(i\d+)\.pixiv\.net/img(\d+)/img/(.+?)/(\d+)\.!
          url = "http://#{$1}.pixiv.net/img#{$2}/img/#{$3}/mobile/#{$4}_240mw.jpg"

        elsif url =~ %r!https?://i\.pximg\.net/img(\d+)/img/(.+?)/(\d+)\.!
          # This pattern has only three groups (the host is fixed), so the
          # pieces are $1..$3 and the host stays i.pximg.net, like the other
          # i.pximg.net branches above.
          url = "http://i.pximg.net/img#{$1}/img/#{$2}/mobile/#{$3}_240mw.jpg"
        end

        return [url, headers]
      end

      # Sets the Referer header on the given headers hash (in place).
      def rewrite_headers(url, headers)
        headers["Referer"] = "http://www.pixiv.net"
        return [url, headers]
      end

      # Rewrite these:
      #   http://www.pixiv.net/i/18557054
      #   http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
      #   http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
      #   http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
      #   http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
      # Plus this:
      #   i2.pixiv.net/img-inf/img/2014/09/25/00/57/24/46170939_64x64.jpg
      def rewrite_html_pages(url, headers)
        if url =~ /illust_id=\d+/i || url =~ %r!pixiv\.net/img-inf/img/!i
          return [source.file_url, headers]
        else
          return [url, headers]
        end
      end

      # Rewrite these:
      #   http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_m.jpg
      #   http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
      def rewrite_thumbnails(url, headers)
        url = source.rewrite_thumbnails(url)
        return [url, headers]
      end

      # Rewrite these:
      #   http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
      #   http://img04.pixiv.net/img/syounen_no_uta/46170939_p0.jpg
      # but not these:
      #   http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_big_p0.jpg
      #   http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
      #   http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg
      #
      # The "_big_p" variant is only used when a HEAD request finds it.
      def rewrite_old_small_manga_pages(url, headers)
        if url !~ %r!/img-(?:original|master)/img/!i && url =~ %r!/(\d+_p\d+)\.!i
          match = $1
          repl = match.sub(/_p/, "_big_p")
          big_url = url.sub(match, repl)
          if http_exists?(big_url, headers)
            url = big_url
          end
        end

        return [url, headers]
      end

      # Strips the ".edgesuite.net" suffix from pixiv.net.edgesuite.net hosts.
      def rewrite_cdn(url, headers)
        if url =~ %r{https?:\/\/(?:\w+\.)?pixiv\.net\.edgesuite\.net}
          url = url.sub(".edgesuite.net", "")
        end

        return [url, headers]
      end

      # Cache the source data so it gets fetched at most once.
      def source
        @source ||= begin
          source = ::Sources::Site.new(url)
          source.get

          source
        end
      end
    end
  end
end

View File

@@ -1,70 +0,0 @@
module Downloads
  module RewriteStrategies
    # Building blocks of the regex used by Tumblr#rewrite_samples.
    DOMAIN = '(data|(\d+\.)?media)\.tumblr\.com'
    MD5 = '(?<md5>[0-9a-f]{32})'
    FILENAME = '(?<filename>(tumblr_(inline_)?)?[a-z0-9]+(_r[0-9]+)?)'
    SIZES = '(250|400|500|500h|540|1280|raw)'
    EXT = '(?<ext>\w+)'

    # Rewrite strategy for Tumblr file and page URLs.
    class Tumblr < Base
      # Returns [url, headers, data] after applying, in order, rewrite_cdn,
      # rewrite_samples and rewrite_html_pages to the URL.
      def rewrite(url, headers, data = {})
        url = rewrite_cdn(url)
        url = rewrite_samples(url, headers)
        url = rewrite_html_pages(url)

        [url, headers, data]
      end

      protected

      # Look for the biggest available version on data.tumblr.com. A bigger
      # version may or may not exist; if none of the candidates exists, the
      # URL that was passed in is kept.
      #
      # http://40.media.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_500h.png
      # => http://data.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_raw.png
      #
      # https://40.media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_1280.jpg
      # => http://data.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_raw.jpg
      #
      # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
      # => http://data.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
      #
      # https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
      # => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png
      #
      # http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg
      # => http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
      #
      # http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
      # => http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
      def rewrite_samples(url, headers)
        return url unless url =~ %r!\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_#{SIZES}\.#{EXT}\z!i

        sample = Regexp.last_match
        # Ordered from largest to smallest; the first one that exists wins.
        candidates = ["raw", 1280, 640, 540, "500h", 500, 400, 250].map do |size|
          "http://data.tumblr.com/#{sample[:dir]}#{sample[:filename]}_#{size}.#{sample[:ext]}"
        end

        # Fall back to the given URL instead of returning nil when no
        # candidate responds successfully.
        candidates.find { |candidate| http_exists?(candidate, headers) } || url
      end

      # https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
      # => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
      #
      # Uses String#sub so the caller's string is not modified in place.
      def rewrite_cdn(url)
        url.sub(%r!\Ahttps?://gs1\.wac\.edgecastcdn\.net/8019B6/data\.tumblr\.com!i, "http://data.tumblr.com")
      end

      # URLs matched by Sources::Strategies::Tumblr are replaced by that
      # strategy's image_url.
      def rewrite_html_pages(url)
        if Sources::Strategies::Tumblr.url_match?(url)
          url = Sources::Strategies::Tumblr.new(url).image_url
        end

        url
      end
    end
  end
end

View File

@@ -1,36 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for twitpic.com pages and the two cloudfront.net hosts
    # listed in the patterns below.
    class Twitpic < Base
      # Returns [url, headers, data]; matching URLs are passed through
      # rewrite_html_pages and then rewrite_thumbnails.
      def rewrite(url, headers, data = {})
        if url =~ %r{https?://twitpic\.com} || url =~ %r{^https?://(?:d3j5vwomefv46c|dn3pm25xmtlyu)\.cloudfront\.net}
          url, headers = rewrite_html_pages(url, headers)
          url, headers = rewrite_thumbnails(url, headers)
        end

        [url, headers, data]
      end

      protected

      # Turns a twitpic.com/<id> page URL into the /show/full/<id> URL; other
      # URLs are returned as given.
      def rewrite_html_pages(url, headers)
        # example: http://twitpic.com/cpprns
        if url =~ %r{https?://twitpic\.com/([a-z0-9]+)$}
          url = "http://twitpic.com/show/full/#{$1}"
        end

        [url, headers]
      end

      # Swaps /thumb/ for /large/ in cloudfront photo URLs. The result is
      # built with String#sub so the caller's string is not modified in place.
      def rewrite_thumbnails(url, headers)
        if url =~ %r{^https?://(?:d3j5vwomefv46c|dn3pm25xmtlyu)\.cloudfront\.net/photos/thumb/(\d+\..+)$}
          file_name = $1
          url = url.sub("/thumb/" + file_name, "/large/" + file_name)
        end

        [url, headers]
      end
    end
  end
end

View File

@@ -1,40 +0,0 @@
module Downloads
  module RewriteStrategies
    # Rewrite strategy for twitter.com pages and pbs.twimg.com files.
    class Twitter < Base
      attr_accessor :url, :source

      def initialize(url)
        @url = url
      end

      # Returns [url, headers, data]. twitter.com (and mobile.twitter.com)
      # URLs are replaced by the image_url of the fetched source;
      # pbs.twimg.com URLs are passed through rewrite_thumbnails.
      def rewrite(url, headers, data = {})
        return [source.image_url, headers, data] if url =~ %r!^https?://(?:mobile\.)?twitter\.com!

        if url =~ %r{^https?://pbs\.twimg\.com}
          url, headers = rewrite_thumbnails(url, headers, data)
        end

        [url, headers, data]
      end

      protected

      # For pbs.twimg.com/media/ URLs, keeps everything up to the first ":"
      # after the media path and appends ":orig".
      def rewrite_thumbnails(url, headers, data)
        if url =~ %r{^(https?://pbs\.twimg\.com/media/[^:]+)}
          url = "#{Regexp.last_match(1)}:orig"
        end

        [url, headers]
      end

      # Cache the source data so it gets fetched at most once.
      def source
        return @source if @source

        fetched = ::Sources::Strategies::Twitter.new(url)
        fetched.get
        @source = fetched
      end
    end
  end
end