From d5ce6c98ef95a40b46c67ea4c802a05ab7f249d9 Mon Sep 17 00:00:00 2001 From: evazion Date: Thu, 22 Jun 2017 23:20:04 -0500 Subject: [PATCH 1/4] tumblr: add tumblr api client. --- Gemfile | 1 + Gemfile.lock | 3 +++ app/logical/tumblr_api_client.rb | 9 +++++++++ 3 files changed, 13 insertions(+) create mode 100644 app/logical/tumblr_api_client.rb diff --git a/Gemfile b/Gemfile index e4178086a..dc59a76af 100644 --- a/Gemfile +++ b/Gemfile @@ -45,6 +45,7 @@ gem 'daemons' gem 'oauth2' gem 'bootsnap' gem 'addressable' +gem 'httparty' # needed for looser jpeg header compat gem 'ruby-imagespec', :require => "image_spec", :git => "https://github.com/r888888888/ruby-imagespec.git", :branch => "exif-fixes" diff --git a/Gemfile.lock b/Gemfile.lock index 45f0286b8..c641d8c14 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -166,6 +166,8 @@ GEM domain_name (~> 0.5) http-form_data (1.0.2) http_parser.rb (0.6.0) + httparty (0.15.5) + multi_xml (>= 0.5.2) httpclient (2.8.0) hurley (0.2) i18n (0.8.1) @@ -412,6 +414,7 @@ DEPENDENCIES gctools google-api-client highline + httparty mechanize memcache-client memcache_mock diff --git a/app/logical/tumblr_api_client.rb b/app/logical/tumblr_api_client.rb new file mode 100644 index 000000000..61b0892ee --- /dev/null +++ b/app/logical/tumblr_api_client.rb @@ -0,0 +1,9 @@ +class TumblrApiClient < Struct.new(:api_key) + include HTTParty + base_uri "https://api.tumblr.com/v2/blog/" + + def posts(blog_name, post_id) + response = self.class.get("/#{blog_name}/posts", query: { id: post_id, api_key: api_key }) + response.parsed_response.with_indifferent_access[:response] + end +end From 118843b26a8b2015977f7832ba2a3eb2aa13c38b Mon Sep 17 00:00:00 2001 From: evazion Date: Thu, 22 Jun 2017 23:32:53 -0500 Subject: [PATCH 2/4] tumblr: fetch tags, commentary, and image urls. --- app/logical/downloads/file.rb | 2 +- app/logical/sources/strategies/tumblr.rb | 120 +++++++++++++++++++++-- config/danbooru_default_config.rb | 6 ++ 3 files changed, 118 insertions(+), 10 deletions(-) diff --git a/app/logical/downloads/file.rb b/app/logical/downloads/file.rb index ffd2b6809..8229426e5 100644 --- a/app/logical/downloads/file.rb +++ b/app/logical/downloads/file.rb @@ -140,8 +140,8 @@ module Downloads def set_source_to_referer(src, referer) if Sources::Strategies::Nijie.url_match?(src) || Sources::Strategies::Twitter.url_match?(src) || - Sources::Strategies::Tumblr.url_match?(src) || Sources::Strategies::Pawoo.url_match?(src) || + Sources::Strategies::Tumblr.url_match?(src) || Sources::Strategies::Tumblr.url_match?(referer) Sources::Strategies::ArtStation.url_match?(src) || Sources::Strategies::ArtStation.url_match?(referer) strategy = Sources::Site.new(src, :referer_url => referer) strategy.referer_url diff --git a/app/logical/sources/strategies/tumblr.rb b/app/logical/sources/strategies/tumblr.rb index 818aaaf62..d9d7062a7 100644 --- a/app/logical/sources/strategies/tumblr.rb +++ b/app/logical/sources/strategies/tumblr.rb @@ -1,28 +1,130 @@ module Sources::Strategies class Tumblr < Base + extend Memoist + def self.url_match?(url) - url =~ %r{^https?://.+\.tumblr\.com/(?:\w+/)?(?:tumblr_)?(\w+_)(\d+)\..+$} || url =~ %r{^https?://[^.]+\.tumblr\.com/(?:post|image)/\d+} + blog_name, post_id = parse_info_from_url(url) + blog_name.present? && post_id.present? end def referer_url - if @referer_url =~ %r{^https?://[^.]+\.tumblr\.com/post/\d+} && @url =~ %r{^https?://.+\.tumblr\.com/(?:\w+/)?(?:tumblr_)?(\w+_)(\d+)\..+$} - @referer_url - elsif @referer_url =~ %r{^https?://[^.]+\.tumblr\.com/image/\d+} && @url =~ %r{^https?://.+\.tumblr\.com/(?:\w+/)?(?:tumblr_)?(\w+_)(\d+)\..+$} - @referer_url.sub("/image/", "/post/") - else - @url - end + blog_name, post_id = self.class.parse_info_from_url(normalized_url) + "https://#{blog_name}.tumblr.com/post/#{post_id}" end def tags - [] + post[:tags].map do |tag| + # normalize tags: space, underscore, and hyphen are equivalent in tumblr tags. + [tag.tr(" _-", "_"), "https://tumblr.com/tagged/#{CGI::escape(tag.tr(" _-", "-"))}"] + end.uniq end def site_name "Tumblr" end + def profile_url + "https://#{artist_name}.tumblr.com/" + end + + def artist_name + post[:blog_name] + end + + def artist_commentary_title + case post[:type] + when "text", "link" + post[:title] + else + nil + end + end + + def artist_commentary_desc + case post[:type] + when "text" + post[:body] + when "link" + post[:description] + when "photo", "video" + post[:caption] + else + nil + end + end + + def image_url + image_urls.first + end + + def image_urls + urls = case post[:type] + when "photo" + post[:photos].map do |photo| + self.class.normalize_image_url(photo[:original_size][:url]) + end + when "video" + [post[:video_url]] + else + [] + end + + urls += self.class.parse_inline_images(artist_commentary_desc) + urls + end + def get end + + module HelperMethods + extend ActiveSupport::Concern + + module ClassMethods + def parse_info_from_url(url) + url =~ %r!\Ahttps?://(?[^.]+)\.tumblr\.com/(?:post|image)/(?\d+)!i + [$1, $2] + end + + def parse_inline_images(text) + html = Nokogiri::HTML.fragment(text) + image_urls = html.css("img").map { |node| node["src"] } + image_urls = image_urls.map(&method(:normalize_image_url)) + image_urls + end + + def normalize_image_url(url) + url, _, _ = Downloads::RewriteStrategies::Tumblr.new.rewrite(url, {}) + url + end + end + + def normalized_url + if self.class.url_match?(@referer_url) + @referer_url + elsif self.class.url_match?(@url) + @url + end + end + end + + module ApiMethods + def client + ::TumblrApiClient.new(Danbooru.config.tumblr_consumer_key) + end + + def api_response + blog_name, post_id = self.class.parse_info_from_url(normalized_url) + client.posts(blog_name, post_id) + end + + def post + api_response[:posts].first + end + end + + include ApiMethods + include HelperMethods + + memoize :client, :api_response end end diff --git a/config/danbooru_default_config.rb b/config/danbooru_default_config.rb index 78a4c26bc..470c9a212 100644 --- a/config/danbooru_default_config.rb +++ b/config/danbooru_default_config.rb @@ -382,6 +382,12 @@ module Danbooru nil end + # 1. Register app at https://www.tumblr.com/oauth/register. + # 2. Copy "OAuth Consumer Key" from https://www.tumblr.com/oauth/apps. + def tumblr_consumer_key + nil + end + def enable_dimension_autotagging true end From 030052bf146e7d0b358e8cbb672583b348417a76 Mon Sep 17 00:00:00 2001 From: evazion Date: Sun, 25 Jun 2017 01:03:23 -0500 Subject: [PATCH 3/4] tumblr: rewrite html pages to image url. --- app/logical/downloads/rewrite_strategies/tumblr.rb | 9 +++++++++ test/unit/downloads/tumblr_test.rb | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/app/logical/downloads/rewrite_strategies/tumblr.rb b/app/logical/downloads/rewrite_strategies/tumblr.rb index 11a462df0..2f6490e26 100644 --- a/app/logical/downloads/rewrite_strategies/tumblr.rb +++ b/app/logical/downloads/rewrite_strategies/tumblr.rb @@ -10,6 +10,7 @@ module Downloads def rewrite(url, headers, data = {}) url = rewrite_cdn(url) url = rewrite_samples(url, headers) + url = rewrite_html_pages(url) return [url, headers, data] end @@ -56,6 +57,14 @@ module Downloads url.sub!(%r!\Ahttps?://gs1\.wac\.edgecastcdn\.net/8019B6/data\.tumblr\.com!i, "http://data.tumblr.com") url end + + def rewrite_html_pages(url) + if Sources::Strategies::Tumblr.url_match?(url) + url = Sources::Strategies::Tumblr.new(url).image_url + end + + url + end end end end diff --git a/test/unit/downloads/tumblr_test.rb b/test/unit/downloads/tumblr_test.rb index 760800285..0a20cdc0c 100644 --- a/test/unit/downloads/tumblr_test.rb +++ b/test/unit/downloads/tumblr_test.rb @@ -74,5 +74,15 @@ module Downloads assert_rewritten(@rewrite, @source) end end + + context "a download for a *.tumblr.com/post/* html page" do + should "download the best available version" do + @source = "https://noizave.tumblr.com/post/162206271767" + @rewrite = "http://data.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_os2buiIOt51wsfqepo1_raw.png" + + assert_downloaded(3_620, @source) + assert_rewritten(@rewrite, @source) + end + end end end From fbb25666b001b001da48978e1c63ec5b86741b1d Mon Sep 17 00:00:00 2001 From: evazion Date: Sun, 25 Jun 2017 01:03:39 -0500 Subject: [PATCH 4/4] tumblr: add source tests. --- test/unit/sources/tumblr_test.rb | 136 +++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 test/unit/sources/tumblr_test.rb diff --git a/test/unit/sources/tumblr_test.rb b/test/unit/sources/tumblr_test.rb new file mode 100644 index 000000000..4f89e4a93 --- /dev/null +++ b/test/unit/sources/tumblr_test.rb @@ -0,0 +1,136 @@ +require 'test_helper' + +module Sources + class TumblrTest < ActiveSupport::TestCase + context "The source for a 'http://*.tumblr.com/post/*' photo post with a single image" do + setup do + @site = Sources::Site.new("https://noizave.tumblr.com/post/162206271767") + @site.get + end + + should "get the artist name" do + assert_equal("noizave", @site.artist_name) + end + + should "get the profile" do + assert_equal("https://noizave.tumblr.com/", @site.profile_url) + end + + should "get the tags" do + tags = [["tag", "https://tumblr.com/tagged/tag"], ["red_hair", "https://tumblr.com/tagged/red-hair"]] + assert_equal(tags, @site.tags) + end + + should "get the commentary" do + desc = <<-EOS.strip_heredoc.chomp +

header

+ +

plain bold italics strike

+ + + +
  1. one
  2. +
  3. two
  4. +
  • one
  • +
    • two
    • +

quote

+ +

link

+ EOS + + assert_nil(@site.artist_commentary_title) + assert_equal(desc, @site.artist_commentary_desc) + end + + should "get the image url" do + assert_equal("http://data.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_os2buiIOt51wsfqepo1_raw.png", @site.image_url) + end + end + + context "The source for a 'http://*.tumblr.com/image/*' image page" do + setup do + @site = Sources::Site.new("https://noizave.tumblr.com/image/162206271767") + @site.get + end + + should "get the image url" do + assert_equal("http://data.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_os2buiIOt51wsfqepo1_raw.png", @site.image_url) + end + + should "get the tags" do + tags = [["tag", "https://tumblr.com/tagged/tag"], ["red_hair", "https://tumblr.com/tagged/red-hair"]] + assert_equal(tags, @site.tags) + end + end + + context "The source for a 'http://*.media.tumblr.com/$hash/tumblr_$id_1280.jpg' image with a referer" do + setup do + @url = "https://68.media.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_1280.jpg" + @ref = "https://noizave.tumblr.com/post/162094447052" + @site = Sources::Site.new(@url, referer_url: @ref) + @site.get + end + + should "get the image urls" do + urls = %w[ + http://data.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_orwwptNBCE1wsfqepo1_raw.png + http://data.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_orwwptNBCE1wsfqepo2_raw.jpg + http://data.tumblr.com/d2ed224f135b0c81f812df81a0a8692d/tumblr_orwwptNBCE1wsfqepo3_raw.gif + http://data.tumblr.com/3bbfcbf075ddf969c996641b264086fd/tumblr_inline_os3134mABB1v11u29_raw.png + http://data.tumblr.com/34ed9d0ff4a21625981372291cb53040/tumblr_nv3hwpsZQY1uft51jo1_raw.gif + ] + + assert_equal(urls, @site.image_urls) + end + + should "get the tags" do + tags = [["tag1", "https://tumblr.com/tagged/tag1"], ["tag2", "https://tumblr.com/tagged/tag2"]] + assert_equal(tags, @site.tags) + end + + should "get the commentary" do + desc = '

description

' + assert_equal(desc, @site.artist_commentary_desc) + end + end + + context "The source for a 'http://*.tumblr.com/post/*' text post with inline images" do + setup do + @site = Sources::Site.new("https://noizave.tumblr.com/post/162221502947") + @site.get + end + + should "get the image urls" do + urls = %w[ + http://data.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_inline_os2zhkfhY01v11u29_raw.png + http://data.tumblr.com/7c4d2c6843466f92c3dd0516e749ec35/tumblr_inline_os2zkg02xH1v11u29_raw.jpg + ] + + assert_equal(urls, @site.image_urls) + end + + should "get the commentary" do + desc = '

description

' + + assert_equal("test post", @site.artist_commentary_title) + assert_equal(desc, @site.artist_commentary_desc) + end + end + + context "The source for a 'http://*.tumblr.com/post/*' video post with inline images" do + setup do + @site = Sources::Site.new("https://noizave.tumblr.com/post/162222617101") + @site.get + end + + should "get the image urls" do + urls = %w[ + https://vtt.tumblr.com/tumblr_os31dkexhK1wsfqep.mp4 + http://data.tumblr.com/afed9f5b3c33c39dc8c967e262955de2/tumblr_inline_os31dclyCR1v11u29_raw.png + ] + + assert_equal(urls, @site.image_urls) + end + end + end +end