tumblr: rewrite samples to biggest available version.
This commit is contained in:
@@ -1,44 +1,60 @@
|
||||
module Downloads
|
||||
module RewriteStrategies
|
||||
DOMAIN = '(data|(\d+\.)?media)\.tumblr\.com'
|
||||
MD5 = '(?<md5>[0-9a-f]{32})'
|
||||
FILENAME = '(?<filename>(tumblr_(inline_)?)?[a-z0-9]+(_r[0-9]+)?)'
|
||||
SIZES = '(250|400|500|500h|540|1280|raw)'
|
||||
EXT = '(?<ext>\w+)'
|
||||
|
||||
class Tumblr < Base
|
||||
def rewrite(url, headers, data = {})
|
||||
if url =~ %r{^https?://.*tumblr\.com}
|
||||
url, headers = rewrite_cdn(url, headers)
|
||||
url, headers = rewrite_thumbnails(url, headers)
|
||||
end
|
||||
url = rewrite_cdn(url)
|
||||
url = rewrite_samples(url, headers)
|
||||
|
||||
return [url, headers, data]
|
||||
end
|
||||
|
||||
protected
|
||||
def rewrite_thumbnails(url, headers)
|
||||
if url =~ %r{^https?://.+\.tumblr\.com/(?:\w+/)?(?:tumblr_)?(\w+_)(\d+)(\..+)$}
|
||||
match = $1
|
||||
given_size = $2
|
||||
file_ext = $3
|
||||
# Look for the biggest available version on data.tumblr.com. A bigger
|
||||
# version may or may not exist.
|
||||
#
|
||||
# http://40.media.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_500h.png
|
||||
# => http://data.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_raw.png
|
||||
#
|
||||
# https://40.media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_1280.jpg
|
||||
# => http://data.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_raw.jpg
|
||||
#
|
||||
# https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
|
||||
# => http://data.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
|
||||
#
|
||||
# https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
|
||||
# => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png
|
||||
#
|
||||
# http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg
|
||||
# => http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
|
||||
#
|
||||
# http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
|
||||
# => http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
|
||||
def rewrite_samples(url, headers)
|
||||
if url =~ %r!\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_#{SIZES}\.#{EXT}\z!i
|
||||
sizes = ["raw", 1280, 540, 500, 400, 250]
|
||||
candidates = sizes.map do |size|
|
||||
"http://data.tumblr.com/#{$~[:dir]}#{$~[:filename]}_#{size}.#{$~[:ext]}"
|
||||
end
|
||||
|
||||
big_1280_url = url.sub(match + given_size, match + "1280")
|
||||
if file_ext == ".gif"
|
||||
res = http_head_request(big_1280_url, headers)
|
||||
# Sometimes the 1280 version of a gif is actually a static jpeg. We don't want that, so we only use the 1280 version if it really is a gif.
|
||||
if res.is_a?(Net::HTTPSuccess) && res["content-type"] == "image/gif"
|
||||
return [big_1280_url, headers]
|
||||
end
|
||||
else
|
||||
if http_exists?(big_1280_url, headers)
|
||||
return [big_1280_url, headers]
|
||||
end
|
||||
url = candidates.find do |candidate|
|
||||
http_exists?(candidate, headers)
|
||||
end
|
||||
end
|
||||
|
||||
return [url, headers]
|
||||
url
|
||||
end
|
||||
|
||||
# https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
|
||||
# => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
|
||||
def rewrite_cdn(url, headers)
|
||||
def rewrite_cdn(url)
|
||||
url.sub!(%r!\Ahttps?://gs1\.wac\.edgecastcdn\.net/8019B6/data\.tumblr\.com!i, "http://data.tumblr.com")
|
||||
return [url, headers]
|
||||
url
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -3,20 +3,68 @@ require 'test_helper'
|
||||
module Downloads
|
||||
class TumblrTest < ActiveSupport::TestCase
|
||||
context "a download for a tumblr 500 sample" do
|
||||
should "instead download the 1280 version" do
|
||||
@source = "http://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_500.jpg"
|
||||
assert_rewritten("http://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_1280.jpg", @source)
|
||||
should "instead download the raw version" do
|
||||
@source = "https://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_500.jpg"
|
||||
@rewrite = "http://data.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_raw.jpg"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(196_617, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a tumblr 500 image without a larger size" do
|
||||
should "download the 500 version" do
|
||||
@source = "http://25.media.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg"
|
||||
context "a download for a *.media.tumblr.com/tumblr_$id_$size image without a larger size" do
|
||||
should "download the same version" do
|
||||
@source = "https://25.media.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg"
|
||||
@rewrite = "http://data.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(90_122, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a *.media.tumblr.com/tumblr_$id_$size image with a larger size" do
|
||||
should "download the best available version" do
|
||||
@source = "https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"
|
||||
@rewrite = "http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(34_060, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a *.media.tumblr.com/$hash/tumblr_$id_rN_$size image" do
|
||||
should "download the best available version" do
|
||||
@source = "https://33.media.tumblr.com/4b7fecf9a5a8284fbaefb051a2369b55/tumblr_npozqfwc9h1rt6u7do1_r1_500.gif"
|
||||
@rewrite = "http://data.tumblr.com/4b7fecf9a5a8284fbaefb051a2369b55/tumblr_npozqfwc9h1rt6u7do1_r1_raw.gif"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(1_234_017, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a *.media.tumblr.com/$hash/tumblr_inline_$id_$size image" do
|
||||
should "download the best available version" do
|
||||
@source = "https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif"
|
||||
@rewrite = "http://data.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(110_348, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a data.tumblr.com/$id_$size image with a larger size" do
|
||||
should "download the best available version" do
|
||||
@source = "http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg"
|
||||
@rewrite = "http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(153_885, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a data.tumblr.com/tumblr_$id_$size.jpg image" do
|
||||
should "download the best available version" do
|
||||
@source = "http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_250.jpg"
|
||||
@rewrite = "http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg"
|
||||
assert_rewritten(@rewrite, @source)
|
||||
assert_downloaded(296_399, @source)
|
||||
end
|
||||
end
|
||||
|
||||
context "a download for a gs1.wac.edgecastcdn.net image" do
|
||||
should "rewrite to the full tumblr version" do
|
||||
@source = "https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"
|
||||
|
||||
Reference in New Issue
Block a user