From 421bbc35a29e7ba0baa4bb076659d93520165df1 Mon Sep 17 00:00:00 2001 From: evazion Date: Wed, 21 Jun 2017 21:09:16 -0500 Subject: [PATCH] tumblr: rewrite samples to biggest available version. --- .../downloads/rewrite_strategies/tumblr.rb | 62 ++++++++++++------- test/unit/downloads/tumblr_test.rb | 60 ++++++++++++++++-- 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/app/logical/downloads/rewrite_strategies/tumblr.rb b/app/logical/downloads/rewrite_strategies/tumblr.rb index 8e934bcaa..11a462df0 100644 --- a/app/logical/downloads/rewrite_strategies/tumblr.rb +++ b/app/logical/downloads/rewrite_strategies/tumblr.rb @@ -1,44 +1,60 @@ module Downloads module RewriteStrategies + DOMAIN = '(data|(\d+\.)?media)\.tumblr\.com' + MD5 = '(?<md5>[0-9a-f]{32})' + FILENAME = '(?<filename>(tumblr_(inline_)?)?[a-z0-9]+(_r[0-9]+)?)' + SIZES = '(250|400|500|500h|540|1280|raw)' + EXT = '(?<ext>\w+)' + class Tumblr < Base def rewrite(url, headers, data = {}) - if url =~ %r{^https?://.*tumblr\.com} - url, headers = rewrite_cdn(url, headers) - url, headers = rewrite_thumbnails(url, headers) - end + url = rewrite_cdn(url) + url = rewrite_samples(url, headers) return [url, headers, data] end protected - def rewrite_thumbnails(url, headers) - if url =~ %r{^https?://.+\.tumblr\.com/(?:\w+/)?(?:tumblr_)?(\w+_)(\d+)(\..+)$} - match = $1 - given_size = $2 - file_ext = $3 + # Look for the biggest available version on data.tumblr.com. A bigger + # version may or may not exist. 
+ # + # http://40.media.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_500h.png + # => http://data.tumblr.com/d8c6d49785c0842ee31ff26c010b7445/tumblr_naypopLln51tkufhoo2_raw.png + # + # https://40.media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_1280.jpg + # => http://data.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_raw.jpg + # + # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif + # => http://data.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif + # + # https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png + # => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png + # + # http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg + # => http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg + # + # http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg + # => http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg + def rewrite_samples(url, headers) + if url =~ %r!\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_#{SIZES}\.#{EXT}\z!i + sizes = ["raw", 1280, 540, 500, 400, 250] + candidates = sizes.map do |size| + "http://data.tumblr.com/#{$~[:dir]}#{$~[:filename]}_#{size}.#{$~[:ext]}" + end - big_1280_url = url.sub(match + given_size, match + "1280") - if file_ext == ".gif" - res = http_head_request(big_1280_url, headers) - # Sometimes the 1280 version of a gif is actually a static jpeg. We don't want that so we only use the 1280 version if it really is a gif. 
- if res.is_a?(Net::HTTPSuccess) && res["content-type"] == "image/gif" - return [big_1280_url, headers] - end - else - if http_exists?(big_1280_url, headers) - return [big_1280_url, headers] - end + url = candidates.find do |candidate| + http_exists?(candidate, headers) end end - return [url, headers] + url end # https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png # => http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png - def rewrite_cdn(url, headers) + def rewrite_cdn(url) url.sub!(%r!\Ahttps?://gs1\.wac\.edgecastcdn\.net/8019B6/data\.tumblr\.com!i, "http://data.tumblr.com") - return [url, headers] + url end end end diff --git a/test/unit/downloads/tumblr_test.rb b/test/unit/downloads/tumblr_test.rb index 0f2290efb..760800285 100644 --- a/test/unit/downloads/tumblr_test.rb +++ b/test/unit/downloads/tumblr_test.rb @@ -3,20 +3,68 @@ require 'test_helper' module Downloads class TumblrTest < ActiveSupport::TestCase context "a download for a tumblr 500 sample" do - should "instead download the 1280 version" do - @source = "http://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_500.jpg" - assert_rewritten("http://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_1280.jpg", @source) + should "instead download the raw version" do + @source = "https://24.media.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_500.jpg" + @rewrite = "http://data.tumblr.com/fc328250915434e66e8e6a92773f79d0/tumblr_mf4nshfibc1s0oswoo1_raw.jpg" + assert_rewritten(@rewrite, @source) assert_downloaded(196_617, @source) end end - context "a download for a tumblr 500 image without a larger size" do - should "download the 500 version" do - @source = "http://25.media.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg" + context "a download for a *.media.tumblr.com/tumblr_$id_$size image without a larger size" do + should "download the same version" do + @source = 
"https://25.media.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg" + @rewrite = "http://data.tumblr.com/tumblr_lxbzel2H5y1r9yjhso1_500.jpg" + assert_rewritten(@rewrite, @source) assert_downloaded(90_122, @source) end end + context "a download for a *.media.tumblr.com/tumblr_$id_$size image with a larger size" do + should "download the best available version" do + @source = "https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png" + @rewrite = "http://data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png" + assert_rewritten(@rewrite, @source) + assert_downloaded(34_060, @source) + end + end + + context "a download for a *.media.tumblr.com/$hash/tumblr_$id_rN_$size image" do + should "download the best available version" do + @source = "https://33.media.tumblr.com/4b7fecf9a5a8284fbaefb051a2369b55/tumblr_npozqfwc9h1rt6u7do1_r1_500.gif" + @rewrite = "http://data.tumblr.com/4b7fecf9a5a8284fbaefb051a2369b55/tumblr_npozqfwc9h1rt6u7do1_r1_raw.gif" + assert_rewritten(@rewrite, @source) + assert_downloaded(1_234_017, @source) + end + end + + context "a download for a *.media.tumblr.com/$hash/tumblr_inline_$id_$size image" do + should "download the best available version" do + @source = "https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif" + @rewrite = "http://data.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif" + assert_rewritten(@rewrite, @source) + assert_downloaded(110_348, @source) + end + end + + context "a download for a data.tumblr.com/$id_$size image with a larger size" do + should "download the best available version" do + @source = "http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg" + @rewrite = "http://data.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg" + assert_rewritten(@rewrite, @source) + assert_downloaded(153_885, @source) + end + end + + context "a download for a data.tumblr.com/tumblr_$id_$size.jpg image" do + should "download the best available version" do + 
@source = "http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_250.jpg" + @rewrite = "http://data.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg" + assert_rewritten(@rewrite, @source) + assert_downloaded(296_399, @source) + end + end + context "a download for a gs1.wac.edgecastcdn.net image" do should "rewrite to the full tumblr version" do @source = "https://gs1.wac.edgecastcdn.net/8019B6/data.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png"