tumblr: extract info from url when api data is unavailable.

Derive the artist name, profile URL, and page URL from the source URLs when the API response is unavailable because the Tumblr post was deleted. This lets the artist finder work on bad_tumblr_id posts.
2018-10-09 11:57:23 -05:00
parent 0c31a5d6a9
commit b0d7d90103
3 changed files with 33 additions and 26 deletions
--- a/app/logical/sources/strategies/base.rb
+++ b/app/logical/sources/strategies/base.rb
@@ -221,6 +221,8 @@ module Sources
          :profile_url => profile_url,
          :image_url => image_url,
          :image_urls => image_urls,
+          :page_url => page_url,
+          :canonical_url => canonical_url,
          :normalized_for_artist_finder_url => normalize_for_artist_finder,
          :tags => tags,
          :translated_tags => translated_tags,
--- a/app/logical/sources/strategies/tumblr.rb
+++ b/app/logical/sources/strategies/tumblr.rb
@@ -1,5 +1,6 @@
 module Sources::Strategies
  class Tumblr < Base
+    BASE_URL = %r!\Ahttps?://(?:[^/]+\.)*tumblr\.com!i
    DOMAIN = %r{(data|(\d+\.)?media)\.tumblr\.com}
    MD5 = %r{(?<md5>[0-9a-f]{32})}i
    FILENAME = %r{(?<filename>(tumblr_(inline_)?)?[a-z0-9]+(_r[0-9]+)?)}i
@@ -13,18 +14,7 @@ module Sources::Strategies
    end

    def self.match?(*urls)
-      urls.compact.any? do |url|
-        blog_name, post_id = parse_info_from_url(url)
-        url =~ IMAGE || blog_name.present? && post_id.present?
-      end
-    end
-
-    def self.parse_info_from_url(url)
-      if url =~ POST
-        [$~[:blog_name], $~[:post_id]]
-      else
-        []
-      end
+      urls.compact.any? { |url| url.match?(BASE_URL) }
    end

    def site_name
@@ -42,22 +32,21 @@ module Sources::Strategies
    end

    def page_url
-      [url, referer_url].each do |x|
-        if x =~ POST
-          blog_name, post_id = self.class.parse_info_from_url(x)
-          return "https://#{blog_name}.tumblr.com/post/#{post_id}"
-        end
-      end
+      return nil unless blog_name.present? && post_id.present?
+      "https://#{blog_name}.tumblr.com/post/#{post_id}"
+    end

-      return super
+    def canonical_url
+      page_url
    end

    def profile_url
-      "https://#{artist_name}.tumblr.com/"
+      return nil if artist_name.blank?
+      "https://#{artist_name}.tumblr.com"
    end

    def artist_name
-      post[:blog_name]
+      post[:blog_name] || blog_name
    end

    def artist_commentary_title
@@ -99,7 +88,6 @@ module Sources::Strategies
        [etag, "https://tumblr.com/tagged/#{CGI.escape(etag)}"]
      end.uniq
    end
-    memoize :tags

    def dtext_artist_commentary_desc
      DText.from_html(artist_commentary_desc).strip
@@ -175,11 +163,18 @@ module Sources::Strategies
      html = Nokogiri::HTML.fragment(artist_commentary_desc)
      html.css("img").map { |node| node["src"] }
    end
-    memoize :inline_images
+
+    def blog_name
+      urls.map { |url| url[POST, :blog_name] }.compact.first
+    end
+
+    def post_id
+      urls.map { |url| url[POST, :post_id] }.compact.first
+    end

    def api_response
      return {} unless self.class.enabled?
-      blog_name, post_id = self.class.parse_info_from_url(page_url)
+      return {} unless blog_name.present? && post_id.present?

      body, code = HttpartyCache.get("/#{blog_name}/posts",
        params: { id: post_id, api_key: Danbooru.config.tumblr_consumer_key },
--- a/test/unit/sources/tumblr_test.rb
+++ b/test/unit/sources/tumblr_test.rb
@@ -16,7 +16,7 @@ module Sources
      end

      should "get the profile" do
-        assert_equal("https://noizave.tumblr.com/", @site.profile_url)
+        assert_equal("https://noizave.tumblr.com", @site.profile_url)
      end

      should "get the tags" do
@@ -180,10 +180,20 @@ module Sources
    end

    context "A deleted tumblr post" do
-      should "work" do
+      should "extract the info from the url" do
        site = Sources::Strategies.find("http://shimetsukage.tumblr.com/post/176805588268/20180809-ssb-coolboy")
+        data = {
+          artist_name: "shimetsukage",
+          profile_url: "https://shimetsukage.tumblr.com",
+          page_url: "https://shimetsukage.tumblr.com/post/176805588268",
+          canonical_url: "https://shimetsukage.tumblr.com/post/176805588268",
+          image_url: nil,
+          image_urls: [],
+          tags: [],
+        }

        assert_nothing_raised { site.to_h }
+        assert_operator(data, :<, site.to_h)
      end
    end
  end