sources: fix sources sometimes choosing wrong strategy (fix #3968)

Fix sources choosing the wrong strategy when the referer belongs to a different site (for example, when uploading a Twitter post with a Pixiv referer).

* Fix `match?` to only consider the main URL, not the referer.
* Change `match?` to match against a list of domains given by the `domains` method.
* Change `match?` from a class method to an instance method.
2018-11-04 13:00:17 -06:00
parent 4219163042
commit 5cf6a43918
13 changed files with 58 additions and 37 deletions
--- a/app/logical/sources/strategies.rb
+++ b/app/logical/sources/strategies.rb
@@ -16,8 +16,8 @@ module Sources
    end

    def self.find(url, referer=nil, default: Strategies::Null)
-      strategy = all.detect { |strategy| strategy.match?(url, referer) } || default
-      strategy&.new(url, referer)
+      strategy = all.map { |strategy| strategy.new(url, referer) }.detect(&:match?)
+      strategy || default.new(url, referer)
    end

    def self.canonical(url, referer)
--- a/app/logical/sources/strategies/art_station.rb
+++ b/app/logical/sources/strategies/art_station.rb
@@ -9,10 +9,6 @@ module Sources::Strategies

    attr_reader :json, :image_urls

-    def self.match?(*urls)
-      urls.compact.any? { |x| x.match?(PROJECT) || x.match?(ASSET) || x.match?(PROFILE)}
-    end
-
    # https://www.artstation.com/artwork/04XA4
    # https://www.artstation.com/artwork/cody-from-sf
    # https://sa-dui.artstation.com/projects/DVERn
@@ -24,6 +20,10 @@ module Sources::Strategies
      end
    end

+    def domains
+      ["artstation.com"]
+    end
+
    def site_name
      "ArtStation"
    end
--- a/app/logical/sources/strategies/base.rb
+++ b/app/logical/sources/strategies/base.rb
@@ -14,14 +14,10 @@
 module Sources
  module Strategies
    class Base
-      attr_reader :url, :referer_url
+      attr_reader :url, :referer_url, :urls, :parsed_url, :parsed_referer, :parsed_urls

      extend Memoist

-      def self.match?(*urls)
-        false
-      end
-
      # Should return true if all prerequisites for using the strategy are met.
      # Return false if the strategy requires api keys that have not been configured.
      def self.enabled?
@@ -41,10 +37,24 @@ module Sources
      def initialize(url, referer_url = nil)
        @url = url
        @referer_url = referer_url
+        @urls = [url, referer_url].select(&:present?)
+
+        @parsed_url = Addressable::URI.heuristic_parse(url) rescue nil
+        @parsed_referer = Addressable::URI.heuristic_parse(referer_url) rescue nil
+        @parsed_urls = [parsed_url, parsed_referer].select(&:present?)
      end

-      def urls
-        [url, referer_url].select(&:present?)
+      # Should return true if this strategy should be used. By default, checks
+      # if the main url belongs to any of the domains associated with this site.
+      def match?
+        return false if parsed_url.nil?
+        parsed_url.domain.in?(domains)
+      end
+
+      # The list of base domains belonging to this site. Subdomains are
+      # automatically included (e.g. "pixiv.net" matches "fanbox.pixiv.net").
+      def domains
+        []
      end

      def site_name
--- a/app/logical/sources/strategies/deviant_art.rb
+++ b/app/logical/sources/strategies/deviant_art.rb
@@ -49,8 +49,8 @@ module Sources
      PATH_PROFILE = %r{\Ahttps?://(www\.)?deviantart\.com/#{ARTIST}/?\z}i
      SUBDOMAIN_PROFILE = %r{\Ahttps?://#{ARTIST}\.deviantart\.com/?\z}i

-      def self.match?(*urls)
-        urls.compact.any? { |x| x.match?(/^https?:\/\/(?:.+?\.)?deviantart\.(?:com|net)/) }
+      def domains
+        ["deviantart.net", "deviantart.com"]
      end

      def site_name
--- a/app/logical/sources/strategies/moebooru.rb
+++ b/app/logical/sources/strategies/moebooru.rb
@@ -38,8 +38,8 @@ module Sources

      delegate :artist_name, :profile_url, :unique_id, :artist_commentary_title, :artist_commentary_desc, :dtext_artist_commentary_title, :dtext_artist_commentary_desc, to: :sub_strategy, allow_nil: true

-      def self.match?(*urls)
-        urls.compact.any? { |x| x.match?(BASE_URL) }
+      def domains
+        ["yande.re", "konachan.com"]
      end

      def site_name
--- a/app/logical/sources/strategies/nico_seiga.rb
+++ b/app/logical/sources/strategies/nico_seiga.rb
@@ -7,8 +7,8 @@ module Sources
      PAGE = %r!\Ahttps?://seiga\.nicovideo\.jp/seiga/im(\d+)!i
      PROFILE = %r!\Ahttps?://seiga\.nicovideo\.jp/user/illust/(\d+)!i

-      def self.match?(*urls)
-        urls.compact.any? { |x| x.match?(URL) }
+      def domains
+        ["nicoseiga.jp", "nicovideo.jp"]
      end

      def site_name
--- a/app/logical/sources/strategies/nijie.rb
+++ b/app/logical/sources/strategies/nijie.rb
@@ -53,8 +53,8 @@ module Sources
      FILENAME = %r!(?:(?<illust_id>\d+)_(?<page>\d+_))?(?<artist_id>\d+)_(?<timestamp>\d{14})(?:_\d+)?!i
      IMAGE_URL = %r!\Ahttps?://pic\d+\.nijie\.info/#{DIR}/#{FILENAME}\.\w+\z!i

-      def self.match?(*urls)
-        urls.compact.any? { |x| x.match?(BASE_URL) }
+      def domains
+        ["nijie.info"]
      end

      def site_name
--- a/app/logical/sources/strategies/pawoo.rb
+++ b/app/logical/sources/strategies/pawoo.rb
@@ -2,10 +2,8 @@ module Sources::Strategies
  class Pawoo < Base
    IMAGE = %r!\Ahttps?://img\.pawoo\.net/media_attachments/files/(\d+/\d+/\d+)!

-    def self.match?(*urls)
-      urls.compact.any? do |x| 
-        x =~ IMAGE || PawooApiClient::Status.is_match?(x) || PawooApiClient::Account.is_match?(x)
-      end
+    def domains
+      ["pawoo.net"]
    end

    def site_name
--- a/app/logical/sources/strategies/pixiv.rb
+++ b/app/logical/sources/strategies/pixiv.rb
@@ -21,10 +21,6 @@ module Sources
      FANBOX_IMAGE = %r!(?:\Ahttps?://fanbox\.pixiv\.net/images/post/(\d+))!
      FANBOX_PAGE = %r!(?:\Ahttps?://www\.pixiv\.net/fanbox/creator/\d+/post/(\d+))!

-      def self.match?(*urls)
-        urls.compact.any? { |x| x.match?(/#{WEB}|#{IMG}|#{I12}|#{TOUCH}|#{PXIMG}|#{FANBOX_IMAGE}|#{FANBOX_ACCOUNT}/i) }
-      end
-
      def self.to_dtext(text)
        if text.nil?
          return nil
@@ -47,6 +43,10 @@ module Sources
        DText.from_html(text)
      end

+      def domains
+        ["pixiv.net", "pximg.net"]
+      end
+
      def site_name
        "Pixiv"
      end
--- a/app/logical/sources/strategies/stash.rb
+++ b/app/logical/sources/strategies/stash.rb
@@ -15,8 +15,12 @@ module Sources
    class Stash < DeviantArt
      STASH = %r{\Ahttps?://sta\.sh/(?<post_id>[0-9a-zA-Z]+)}i

-      def self.match?(*urls)
-        urls.compact.any? { |x| x =~ STASH }
+      def domains
+        ["deviantart.net", "sta.sh"]
+      end
+
+      def match?
+        parsed_urls.map(&:domain).any?("sta.sh")
      end

      def site_name
--- a/app/logical/sources/strategies/tumblr.rb
+++ b/app/logical/sources/strategies/tumblr.rb
@@ -15,8 +15,8 @@ module Sources::Strategies
      Danbooru.config.tumblr_consumer_key.present?
    end

-    def self.match?(*urls)
-      urls.compact.any? { |url| url.match?(BASE_URL) }
+    def domains
+      ["tumblr.com"]
    end

    def site_name
--- a/app/logical/sources/strategies/twitter.rb
+++ b/app/logical/sources/strategies/twitter.rb
@@ -9,10 +9,6 @@ module Sources::Strategies
    # https://developer.twitter.com/en/docs/developer-utilities/configuration/api-reference/get-help-configuration
    RESERVED_USERNAMES = %w[home i intent search]

-    def self.match?(*urls)
-      urls.compact.any? { |x| x =~ PAGE || x =~ ASSET}
-    end
-
    def self.enabled?
      TwitterService.new.enabled?
    end
@@ -35,6 +31,10 @@ module Sources::Strategies
      end
    end

+    def domains
+      ["twitter.com", "twimg.com"]
+    end
+
    def site_name
      "Twitter"
    end
--- a/test/unit/sources/twitter_test.rb
+++ b/test/unit/sources/twitter_test.rb
@@ -229,5 +229,14 @@ module Sources
        assert_equal(desc2, site.dtext_artist_commentary_desc)
      end
    end
+
+    context "A twitter post with a pixiv referer" do
+      should "use the twitter strategy" do
+        site = Sources::Strategies.find("https://twitter.com/Mityubi/status/849630665603665920", "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=56735489")
+
+        assert_equal(site.site_name, "Twitter")
+        assert_equal("https://pbs.twimg.com/media/C8p-gPhVoAMZupS.png:orig", site.image_url)
+      end
+    end
  end
 end