diff --git a/app/logical/source/url.rb b/app/logical/source/url.rb
index b6612906d..fcb7cc63c 100644
--- a/app/logical/source/url.rb
+++ b/app/logical/source/url.rb
@@ -30,6 +30,7 @@ module Source
       Source::URL::Newgrounds,
       Source::URL::Plurk,
       Source::URL::Skeb,
+      Source::URL::Tumblr,
       Source::URL::TwitPic,
       Source::URL::Weibo,
     ]
diff --git a/app/logical/source/url/tumblr.rb b/app/logical/source/url/tumblr.rb
new file mode 100644
index 000000000..0f8c0e7c8
--- /dev/null
+++ b/app/logical/source/url/tumblr.rb
@@ -0,0 +1,81 @@
+# frozen_string_literal: true
+
+# Parses Tumblr media-host asset URLs and blog post/image page URLs.
+class Source::URL::Tumblr < Source::URL
+  attr_reader :work_id, :blog_name, :directory, :full_image_url
+
+  # @return [Boolean] true when the URL's domain is tumblr.com
+  def self.match?(url)
+    url.domain == "tumblr.com"
+  end
+
+  # Sets the instance variables read by the methods below, depending on which
+  # URL shape the host and path segments match. Other shapes set nothing.
+  def parse
+    case [host, *path_segments]
+
+    # https://66.media.tumblr.com/168dabd09d5ad69eb5fedcf94c45c31a/3dbfaec9b9e0c2e3-72/s640x960/bf33a1324f3f36d2dc64f011bfeab4867da62bc8.png
+    # https://66.media.tumblr.com/5a2c3fe25c977e2281392752ab971c90/3dbfaec9b9e0c2e3-92/s500x750/4f92bbaaf95c0b4e7970e62b1d2e1415859dd659.png
+    in /(\d+\.)?media\.tumblr\.com/ => host, *directories, /s\d+x\d+/ => dimensions, file
+      @directory = directories.first
+      max_size = Integer.sqrt(Danbooru.config.max_image_resolution)
+      @full_image_url = url.to_s.gsub(%r{/s\d+x\d+/\w+\.\w+\z}i, "/s#{max_size}x#{max_size}/#{file}")
+      @file = file
+
+    # http://data.tumblr.com/07e7bba538046b2b586433976290ee1f/tumblr_o3gg44HcOg1r9pi29o1_raw.jpg
+    # https://40.media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_1280.jpg
+    # https://media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_raw.jpg
+    # https://66.media.tumblr.com/2c6f55531618b4335c67e29157f5c1fc/tumblr_pz4a44xdVj1ssucdno1_1280.png
+    # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
+    # https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
+    # https://66.media.tumblr.com/b9395771b2d0435fe4efee926a5a7d9c/tumblr_pg2wu1L9DM1trd056o2_500h.png
+    # https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
+    # https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png
+    # https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
+    # https://media.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
+    # https://va.media.tumblr.com/tumblr_pgohk0TjhS1u7mrsl.mp4
+    in /\A(data|(?:\d+\.)?media|(?:vtt|ve|va\.media))\.tumblr\.com/, *directory, file
+      @directory = directory.first
+      @file = file
+      # A filename without an extension does not match; leave all three nil instead of raising.
+      @filename, @old_variant_size, @extension = @file.match(/(\w+?)(?:_(\d+h?|raw))?\.(\w+)\z/)&.captures
+
+    # https://marmaladica.tumblr.com/post/188237914346/saved
+    # https://emlan.tumblr.com/post/189469423572/kuro-attempts-to-buy-a-racy-book-at-comiket-but
+    # https://superboin.tumblr.com/post/141169066579/photoset_iframe/superboin/tumblr_o45miiAOts1u6rxu8/500/false
+    # https://make-do5.tumblr.com/post/619663949657423872
+    in _, ("post" | "image"), /\d+/ => work_id, *rest
+      @blog_name = subdomain unless subdomain == "www"
+      @work_id = work_id
+
+    else
+    end
+  end
+
+  # @return [Boolean] true when the URL was parsed as a media file rather than a page
+  def asset_url?
+    @file.present?
+  end
+
+  # Candidate URLs on media.tumblr.com for an old-style sized filename, largest
+  # size first. Empty when the filename carried no size suffix.
+  def variants
+    return [] unless @old_variant_size.present?
+    directory = "#{@directory}/" if @directory.present?
+
+    sizes = %w[1280 640 540 500h 500 400 250 100]
+    sizes.map { |size| "https://media.tumblr.com/#{directory}#{@filename}_#{size}.#{@extension}" }
+  end
+
+  # @return [String, nil] the post URL, when both the blog name and the post id are known
+  def page_url
+    return nil unless @blog_name.present? && @work_id.present?
+    "https://#{@blog_name}.tumblr.com/post/#{@work_id}"
+  end
+
+  # @return [String, nil] the blog URL, when the blog name is known
+  def profile_url
+    return nil unless @blog_name.present?
+    "https://#{@blog_name}.tumblr.com"
+  end
+end
diff --git a/app/logical/sources/strategies/tumblr.rb b/app/logical/sources/strategies/tumblr.rb
index bf3cde66d..b87e33eff 100644
--- a/app/logical/sources/strategies/tumblr.rb
+++ b/app/logical/sources/strategies/tumblr.rb
@@ -1,73 +1,39 @@
 # frozen_string_literal: true
 
-# https://marmaladica.tumblr.com/post/188237914346/saved
-# https://66.media.tumblr.com/2c6f55531618b4335c67e29157f5c1fc/tumblr_pz4a44xdVj1ssucdno1_1280.png
-# https://66.media.tumblr.com/11700cab20d65d5a6acc470e284dbd3a/tumblr_pz4a44xdVj1ssucdno2_1280.png
-#
-# https://emlan.tumblr.com/post/189469423572/kuro-attempts-to-buy-a-racy-book-at-comiket-but
-# https://66.media.tumblr.com/168dabd09d5ad69eb5fedcf94c45c31a/3dbfaec9b9e0c2e3-72/s640x960/bf33a1324f3f36d2dc64f011bfeab4867da62bc8.png
-# https://66.media.tumblr.com/5a2c3fe25c977e2281392752ab971c90/3dbfaec9b9e0c2e3-92/s500x750/4f92bbaaf95c0b4e7970e62b1d2e1415859dd659.png
-#
-# https://superboin.tumblr.com/post/141169066579/photoset_iframe/superboin/tumblr_o45miiAOts1u6rxu8/500/false
-#
-# https://make-do5.tumblr.com/post/619663949657423872 (extremely high res, extractable)
-
+# @see Source::URL::Tumblr
 module Sources::Strategies
   class Tumblr < Base
-    SIZES = %w[1280 640 540 500h 500 400 250 100]
-
-    BASE_URL = %r{\Ahttps?://(?:[^/]+\.)*tumblr\.com}i
-    DOMAIN = /(data|(?:\d+\.)?media)\.tumblr\.com/i
-    MD5 = /(?<md5>[0-9a-f]{32})/i
-    FILENAME = /(?<filename>(?:tumblr_(?:inline_)?)?[a-z0-9]+(?:_r[0-9]+)?)/i
-    EXT = /(?<ext>\w+)/
-
-    # old: https://66.media.tumblr.com/2c6f55531618b4335c67e29157f5c1fc/tumblr_pz4a44xdVj1ssucdno1_1280.png
-    # new: https://66.media.tumblr.com/168dabd09d5ad69eb5fedcf94c45c31a/3dbfaec9b9e0c2e3-72/s640x960/bf33a1324f3f36d2dc64f011bfeab4867da62bc8.png
-    OLD_IMAGE = %r{\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_(?<size>\w+)\.#{EXT}\z}i
-
-    IMAGE = %r{\Ahttps?://#{DOMAIN}/}i
-    VIDEO = %r{\Ahttps?://(?:vtt|ve|va\.media)\.tumblr\.com/}i
-    POST = %r{\Ahttps?://(?<blog_name>[^.]+)\.tumblr\.com/(?:post|image)/(?<post_id>\d+)}i
-
     def self.enabled?
       Danbooru.config.tumblr_consumer_key.present?
     end
 
-    def domains
-      ["tumblr.com"]
+    def match?
+      Source::URL::Tumblr === parsed_url
     end
 
     def site_name
-      "Tumblr"
-    end
-
-    def image_url
-      return image_urls.first unless url.match?(IMAGE) || url.match?(VIDEO)
-      find_largest(url)
+      parsed_url.site_name
     end
 
     def image_urls
-      list = []
+      return [find_largest(parsed_url)].compact if parsed_url.asset_url?
+
+      assets = []
 
       case post[:type]
       when "photo"
-        list += post[:photos].map do |photo|
+        assets += post[:photos].map do |photo|
           sizes = [photo[:original_size]] + photo[:alt_sizes]
           biggest = sizes.max_by { |x| x[:width] * x[:height] }
           biggest[:url]
         end
 
       when "video"
-        list += [post[:video_url]]
-
-        # api response is blank (work is deleted or we were given a direct image with no referer url)
-      when nil
-        list += [url] if url.match?(IMAGE) || url.match?(VIDEO)
+        assets += [post[:video_url]]
       end
 
-      list += inline_images
-      list.map { |url| find_largest(url) }
+      assets += inline_images
+      assets.map { |url| find_largest(url) }
     end
 
     def preview_urls
@@ -77,21 +43,11 @@ module Sources::Strategies
     end
 
     def page_url
-      return nil unless blog_name.present? && post_id.present?
-      "https://#{blog_name}.tumblr.com/post/#{post_id}"
-    end
-
-    def canonical_url
-      page_url
+      parsed_url.page_url || parsed_referer&.page_url || post_url_from_image_html&.page_url
     end
 
     def profile_url
-      return nil if artist_name.blank?
-      "https://#{artist_name}.tumblr.com"
-    end
-
-    def artist_name
-      post[:blog_name] || blog_name
+      parsed_url.profile_url || parsed_referer&.profile_url || post_url_from_image_html&.profile_url
     end
 
     def artist_commentary_title
@@ -138,72 +94,63 @@ module Sources::Strategies
     end
 
     def normalize_for_source
-      return unless blog_name.present? && post_id.present?
-
-      "https://#{blog_name}.tumblr.com/post/#{post_id}"
+      parsed_url.page_url
     end
 
     def dtext_artist_commentary_desc
       DText.from_html(artist_commentary_desc).strip
     end
 
-    # Look for the biggest available version on media.tumblr.com. A bigger
-    # version may or may not exist.
-    #
-    # https://40.media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_1280.jpg
-    # => https://media.tumblr.com/de018501416a465d898d24ad81d76358/tumblr_nfxt7voWDX1rsd4umo1_r23_raw.jpg
-    #
-    # https://68.media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_250.gif
-    # => https://media.tumblr.com/ee02048f5578595badc95905e17154b4/tumblr_inline_ofbr4452601sk4jd9_500.gif
-    #
-    # https://25.media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_500.png
-    # => https://media.tumblr.com/tumblr_m2dxb8aOJi1rop2v0o1_1280.png
-    #
-    # http://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_400.jpg
-    # => https://media.tumblr.com/0DNBGJovY5j3smfeQs8nB53z_500.jpg
-    #
-    # http://media.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
-    # => https://media.tumblr.com/tumblr_m24kbxqKAX1rszquso1_1280.jpg
-    def find_largest(url, sizes: SIZES)
-      if url =~ OLD_IMAGE
-        candidates = sizes.map do |size|
-          "https://media.tumblr.com/#{$~[:dir]}#{$~[:filename]}_#{size}.#{$~[:ext]}"
-        end
-
-        candidates.find do |candidate|
-          http_exists?(candidate)
-        end
-      elsif url =~ %r{/s\d+x\d+/(\w+\.\w+)\z}i
-        max_size = Integer.sqrt(Danbooru.config.max_image_resolution)
-        url = url.gsub(%r{/s\d+x\d+/\w+\.\w+\z}i, "/s#{max_size}x#{max_size}/#{$1}")
-
-        resp = http.cache(1.minute).headers(accept: "text/html").get(url).parse
-        resp.at("img[src*='/s#{max_size}x#{max_size}/']")["src"]
+    # Picks the URL to download for an asset: the image embedded in the max-size
+    # HTML page, else the first size variant that exists, else the original URL.
+    def find_largest(image_url)
+      parsed_image = Source::URL.parse(image_url)
+      if parsed_image.full_image_url.present?
+        image_url_html(parsed_image.full_image_url)&.at("img[src*='/#{parsed_image.directory}/']")&.[](:src)
+      elsif parsed_image.variants.present?
+        # Look for the biggest available version on media.tumblr.com. A bigger
+        # version may or may not exist.
+        parsed_image.variants.find { |variant| http_exists?(variant) }
       else
-        url
+        parsed_image.original_url
       end
     end
 
+    # Fetches parsed_url as HTML and parses the href of an element linking to
+    # "/post/". Returns nil when the page or such a link is unavailable.
+    def post_url_from_image_html
+      extracted = image_url_html(parsed_url)&.at("[href*='/post/']")&.[](:href)
+      Source::URL.parse(extracted) if extracted.present?
+    end
+
+    # Requests the URL with an HTML Accept header; returns the parsed response,
+    # or nil unless the status is 200.
+    def image_url_html(image_url)
+      resp = http.cache(1.minute).headers(accept: "text/html").get(image_url)
+      return nil if resp.code != 200
+      resp.parse
+    end
+
     def inline_images
       html = Nokogiri::HTML5.fragment(artist_commentary_desc)
       html.css("img").map { |node| node["src"] }
     end
 
-    def blog_name
-      urls.map { |url| url[POST, :blog_name] }.compact.first
+    def artist_name
+      parsed_url.blog_name || parsed_referer&.blog_name || post_url_from_image_html&.blog_name
     end
 
-    def post_id
-      urls.map { |url| url[POST, :post_id] }.compact.first
+    def work_id
+      parsed_url.work_id || parsed_referer&.work_id || post_url_from_image_html&.work_id
     end
 
     def api_response
       return {} unless self.class.enabled?
-      return {} unless blog_name.present? && post_id.present?
+      return {} unless artist_name.present? && work_id.present?
 
       response = http.cache(1.minute).get(
-        "https://api.tumblr.com/v2/blog/#{blog_name}/posts",
-        params: { id: post_id, api_key: Danbooru.config.tumblr_consumer_key }
+        "https://api.tumblr.com/v2/blog/#{artist_name}/posts",
+        params: { id: work_id, api_key: Danbooru.config.tumblr_consumer_key }
       )
 
       return {} if response.code != 200
diff --git a/test/unit/sources/tumblr_test.rb b/test/unit/sources/tumblr_test.rb
index c00dad23e..2c51e75f6 100644
--- a/test/unit/sources/tumblr_test.rb
+++ b/test/unit/sources/tumblr_test.rb
@@ -1,4 +1,4 @@
-require 'test_helper'
+require "test_helper"
 
 module Sources
   class TumblrTest < ActiveSupport::TestCase
@@ -114,7 +114,7 @@ module Sources
       end
 
       context "with a referer" do
-        should "get all the images and metadata" do
+        should "get all the metadata" do
           site = Sources::Strategies.find(@url, @ref)
 
           assert_equal("noizave", site.artist_name)
@@ -122,26 +122,18 @@ module Sources
           assert_equal(["tag1", "tag2"], site.tags.map(&:first))
           assert_equal(@ref, site.canonical_url)
           assert_equal("https://media.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_1280.jpg", site.image_url)
-          assert_equal(%w[
-            https://media.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_orwwptNBCE1wsfqepo1_1280.png
-            https://media.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_1280.jpg
-            https://media.tumblr.com/d2ed224f135b0c81f812df81a0a8692d/tumblr_orwwptNBCE1wsfqepo3_640.gif
-            https://media.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_inline_os3134mABB1v11u29_1280.png
-            https://media.tumblr.com/34ed9d0ff4a21625981372291cb53040/tumblr_nv3hwpsZQY1uft51jo1_1280.gif
-          ], site.image_urls)
         end
       end
 
       context "without a referer" do
-        should "get the original image" do
+        should "still find all the relevant information" do
           site = Sources::Strategies.find(@url)
 
-          assert_nil(site.artist_name)
-          assert_nil(site.profile_url)
-          assert_nil(site.canonical_url)
-          assert_equal([], site.tags)
+          assert_equal("noizave", site.artist_name)
+          assert_equal("https://noizave.tumblr.com", site.profile_url)
+          assert_equal(["tag1", "tag2"], site.tags.map(&:first))
+          assert_equal(@ref, site.canonical_url)
           assert_equal("https://media.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_1280.jpg", site.image_url)
-          assert_equal(["https://media.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_1280.jpg"], site.image_urls)
         end
       end
     end
@@ -161,40 +153,24 @@ module Sources
       end
 
       should "get the commentary" do
-        desc = %r!description!
+        desc = %r{description}
         assert_equal("test post", @site.artist_commentary_title)
         assert_match(desc, @site.artist_commentary_desc)
       end
     end
 
-    context "The source for a 'http://ve.media.tumblr.com/*' video post with inline images" do
-      setup do
-        @url = "https://va.media.tumblr.com/tumblr_os31dkexhK1wsfqep.mp4"
-        @ref = "https://noizave.tumblr.com/post/162222617101"
-      end
+    context "A video post with inline images" do
+      should "get the video and inline images" do
+        url = "https://noizave.tumblr.com/post/162222617101"
+        site = Sources::Strategies.find(url)
+        urls = %w[
+          https://va.media.tumblr.com/tumblr_os31dkexhK1wsfqep.mp4
+          https://media.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_inline_os31dclyCR1v11u29_1280.png
+        ]
 
-      context "with a referer" do
-        should "get the video and inline images" do
-          site = Sources::Strategies.find(@url, @ref)
-          urls = %w[
-            https://va.media.tumblr.com/tumblr_os31dkexhK1wsfqep.mp4
-            https://media.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_inline_os31dclyCR1v11u29_1280.png
-          ]
-
-          assert_equal(@url, site.image_url)
-          assert_equal(urls, site.image_urls)
-          assert_equal(@ref, site.canonical_url)
-        end
-      end
-
-      context "without a referer" do
-        should "get the video" do
-          site = Sources::Strategies.find(@url)
-
-          assert_equal(@url, site.image_url)
-          assert_equal([@url], site.image_urls)
-          assert_nil(site.canonical_url)
-        end
+        assert_equal("https://va.media.tumblr.com/tumblr_os31dkexhK1wsfqep.mp4", site.image_url)
+        assert_equal(urls, site.image_urls)
+        assert_equal(url, site.canonical_url)
       end
     end
 
@@ -254,7 +230,6 @@ module Sources
           site = Sources::Strategies.find(image, page)
 
           assert_equal(full, site.image_url)
-          assert_equal(full, site.image_urls.second)
         end
       end
     end