Merge branch 'master' into fix-pixiv-profile-url

2020-06-24 00:06:55 -05:00
parent 158a4aa916 8ef2b7772d
commit 185693b99b
103 changed files with 1639 additions and 2247 deletions
--- a/app/logical/sources/strategies/art_station.rb
+++ b/app/logical/sources/strategies/art_station.rb
@@ -147,7 +147,7 @@ module Sources::Strategies
        urls = urls.reverse
      end

-      chosen_url = urls.find { |url| http_exists?(url, headers) }
+      chosen_url = urls.find { |url| http_exists?(url) }
      chosen_url || url
    end
  end
--- a/app/logical/sources/strategies/base.rb
+++ b/app/logical/sources/strategies/base.rb
@@ -14,6 +14,8 @@
 module Sources
  module Strategies
    class Base
+      class DownloadError < StandardError; end
+
      attr_reader :url, :referer_url, :urls, :parsed_url, :parsed_referer, :parsed_urls

      extend Memoist
@@ -35,9 +37,9 @@ module Sources
      #   <tt>referrer_url</tt> so the strategy can discover the HTML
      #   page and other information.
      def initialize(url, referer_url = nil)
-        @url = url
-        @referer_url = referer_url
-        @urls = [url, referer_url].select(&:present?)
+        @url = url.to_s
+        @referer_url = referer_url&.to_s
+        @urls = [@url, @referer_url].select(&:present?)

        @parsed_url = Addressable::URI.heuristic_parse(url) rescue nil
        @parsed_referer = Addressable::URI.heuristic_parse(referer_url) rescue nil
@@ -139,15 +141,28 @@ module Sources
      # Subclasses should merge in any required headers needed to access resources
      # on the site.
      def headers
-        Danbooru.config.http_headers
+        {}
      end

      # Returns the size of the image resource without actually downloading the file.
      def size
-        Downloads::File.new(image_url).size
+        http.head(image_url).content_length.to_i
      end
      memoize :size

+      # Fetch download_url (the main image url unless another is given) and return the file from http.download_media; raises DownloadError on a blank url or a non-200 status.
+      def download_file!(download_url = image_url)
+        raise DownloadError, "Download failed: couldn't find download url for #{url}" if download_url.blank?
+        response, downloaded = http.download_media(download_url)
+        return downloaded if response.status == 200
+        raise DownloadError, "Download failed: #{download_url} returned error #{response.status}"
+      end
+
+      def http
+        Danbooru::Http.public_only.timeout(30).max_size(Danbooru.config.max_file_size)
+      end
+      memoize :http
+
      # The url to use for artist finding purposes. This will be stored in the
      # artist entry. Normally this will be the profile url.
      def normalize_for_artist_finder
@@ -274,9 +289,8 @@ module Sources
        to_h.to_json
      end

-      def http_exists?(url, headers)
-        res = HTTParty.head(url, Danbooru.config.httparty_options.deep_merge(headers: headers))
-        res.success?
+      def http_exists?(url, headers = {})
+        http.headers(headers).head(url).status.success?
      end

      # Convert commentary to dtext by stripping html tags. Sites can override
--- a/app/logical/sources/strategies/hentai_foundry.rb
+++ b/app/logical/sources/strategies/hentai_foundry.rb
@@ -64,11 +64,10 @@ module Sources
      def page
        return nil if page_url.blank?

-        doc = Cache.get("hentai-foundry:#{page_url}", 1.minute) do
-          HTTParty.get("#{page_url}?enterAgree=1").body
-        end
+        page_response = Danbooru::Http.new.cache(1.minute).get("#{page_url}?enterAgree=1") # response cached for one minute
+        return nil if page_response.status != 200

-        Nokogiri::HTML(doc)
+        page_response.parse
      end

      def tags
--- a/app/logical/sources/strategies/nico_seiga.rb
+++ b/app/logical/sources/strategies/nico_seiga.rb
@@ -73,8 +73,7 @@ module Sources
      end

      def image_url
-        return if image_urls.blank?
-        return url if api_client.blank?
+        return url if image_urls.blank? || api_client.blank?

        img = case url
        when DIRECT || CDN_DIRECT then "https://seiga.nicovideo.jp/image/source/#{image_id_from_url(url)}"
@@ -83,7 +82,7 @@ module Sources
        end

        resp = api_client.get(img)
-        if resp.headers["Location"] =~ %r{https?://.+/(\w+/\d+/\d+)\z}i
+        if resp.uri.to_s =~ %r{https?://.+/(\w+/\d+/\d+)\z}i
          "https://lohas.nicoseiga.jp/priv/#{$1}"
        else
          img
@@ -181,12 +180,12 @@ module Sources

      def api_client
        if illust_id.present?
-          NicoSeigaApiClient.new(work_id: illust_id, type: "illust")
+          NicoSeigaApiClient.new(work_id: illust_id, type: "illust", http: http)
        elsif manga_id.present?
-          NicoSeigaApiClient.new(work_id: manga_id, type: "manga")
+          NicoSeigaApiClient.new(work_id: manga_id, type: "manga", http: http)
        elsif image_id.present?
          # We default to illust to attempt getting the api anyway
-          NicoSeigaApiClient.new(work_id: image_id, type: "illust")
+          NicoSeigaApiClient.new(work_id: image_id, type: "illust", http: http)
        end
      end
      memoize :api_client
--- a/app/logical/sources/strategies/nijie.rb
+++ b/app/logical/sources/strategies/nijie.rb
@@ -178,54 +178,21 @@ module Sources
      def page
        return nil if page_url.blank?

-        doc = agent.get(page_url)
+        client = Danbooru::Http.new # local client; named so it does not shadow the strategy's `http` method
+        credentials = { email: Danbooru.config.nijie_login, password: Danbooru.config.nijie_password }

-        if doc.search("div#header-login-container").any?
-          # Session cache is invalid, clear it and log in normally.
-          Cache.delete("nijie-session")
-          doc = agent.get(page_url)
-        end
+        # Log in first. XXX `retriable` must come after `cache` so that retries don't return cached error responses.
+        login = client.cache(1.hour).use(retriable: { max_retries: 20 }).post("https://nijie.info/login_int.php", form: credentials)
+        DanbooruLogger.info "Nijie login failed (#{url}, #{login.status})" if login.status != 200
+        return nil unless login.status == 200

-        doc
-      rescue Mechanize::ResponseCodeError => e
-        return nil if e.response_code.to_i == 404
-        raise
+        response = client.cookies(R18: 1).cache(1.minute).get(page_url) # R18 cookie allows viewing adult works
+        return nil unless response.status == 200
+
+        response.parse
      end
+
      memoize :page
-
-      def agent
-        mech = Mechanize.new
-
-        session = Cache.get("nijie-session")
-        if session
-          cookie = Mechanize::Cookie.new("NIJIEIJIEID", session)
-          cookie.domain = ".nijie.info"
-          cookie.path = "/"
-          mech.cookie_jar.add(cookie)
-        else
-          mech.get("https://nijie.info/login.php") do |page|
-            page.form_with(:action => "/login_int.php") do |form|
-              form['email'] = Danbooru.config.nijie_login
-              form['password'] = Danbooru.config.nijie_password
-            end.click_button
-          end
-          session = mech.cookie_jar.cookies.select {|c| c.name == "NIJIEIJIEID"}.first
-          Cache.put("nijie-session", session.value, 1.day) if session
-        end
-
-        # This cookie needs to be set to allow viewing of adult works while anonymous
-        cookie = Mechanize::Cookie.new("R18", "1")
-        cookie.domain = ".nijie.info"
-        cookie.path = "/"
-        mech.cookie_jar.add(cookie)
-
-        mech
-      rescue Mechanize::ResponseCodeError => e
-        raise unless e.response_code.to_i == 429
-        sleep(5)
-        retry
-      end
-      memoize :agent
    end
  end
 end
--- a/app/logical/sources/strategies/null.rb
+++ b/app/logical/sources/strategies/null.rb
@@ -47,7 +47,7 @@ module Sources
        when %r{\Ahttps?://c(?:s|han|[1-4])\.sankakucomplex\.com/data(?:/sample)?/(?:[a-f0-9]{2}/){2}(?:sample-|preview)?([a-f0-9]{32})}i
          "https://chan.sankakucomplex.com/en/post/show?md5=#{$1}"

-        when %r{\Ahttps?://(?:www|s(?:tatic|[1-4]))\.zerochan\.net/.+(?:\.|\/)(\d+)(?:\.(?:jpe?g?))?\z}i
+        when %r{\Ahttps?://(?:www|s(?:tatic|[1-4]))\.zerochan\.net/.+(?:\.|\/)(\d+)(?:\.(?:jpe?g?|png))?\z}i
          "https://www.zerochan.net/#{$1}#full"

        when %r{\Ahttps?://static[1-6]?\.minitokyo\.net/(?:downloads|view)/(?:\d{2}/){2}(\d+)}i
--- a/app/logical/sources/strategies/pixiv.rb
+++ b/app/logical/sources/strategies/pixiv.rb
@@ -64,9 +64,6 @@ module Sources
      ORIG_IMAGE = %r{#{PXIMG}/img-original/img/#{DATE}/(?<illust_id>\d+)_p(?<page>\d+)\.#{EXT}\z}i
      STACC_PAGE = %r{\A#{WEB}/stacc/#{MONIKER}/?\z}i
      NOVEL_PAGE = %r{(?:\Ahttps?://www\.pixiv\.net/novel/show\.php\?id=(\d+))}
-      FANBOX_ACCOUNT = %r{(?:\Ahttps?://www\.pixiv\.net/fanbox/creator/\d+\z)}
-      FANBOX_IMAGE = %r{(?:\Ahttps?://fanbox\.pixiv\.net/images/post/(\d+))}
-      FANBOX_PAGE = %r{(?:\Ahttps?://www\.pixiv\.net/fanbox/creator/\d+/post/(\d+))}

      def self.to_dtext(text)
        if text.nil?
@@ -127,14 +124,6 @@ module Sources
          return "https://www.pixiv.net/novel/show.php?id=#{novel_id}&mode=cover"
        end

-        if fanbox_id.present?
-          return "https://www.pixiv.net/fanbox/creator/#{metadata.user_id}/post/#{fanbox_id}"
-        end
-
-        if fanbox_account_id.present?
-          return "https://www.pixiv.net/fanbox/creator/#{fanbox_account_id}"
-        end
-
        if illust_id.present?
          return "https://www.pixiv.net/artworks/#{illust_id}"
        end
@@ -192,17 +181,7 @@ module Sources
      end

      def headers
-        if fanbox_id.present?
-          # need the session to download fanbox images
-          return {
-            "Referer" => "https://www.pixiv.net/fanbox",
-            "Cookie" => HTTP::Cookie.cookie_value(agent.cookies)
-          }
-        end
-
-        {
-          "Referer" => "https://www.pixiv.net"
-        }
+        { "Referer" => "https://www.pixiv.net" }
      end

      def normalize_for_source
@@ -242,10 +221,6 @@ module Sources
      end

      def image_urls_sub
-        if url =~ FANBOX_IMAGE
-          return [url]
-        end
-
        # there's too much normalization bullshit we have to deal with
        # raw urls, so just fetch the canonical url from the api every
        # time.
@@ -265,7 +240,7 @@ module Sources
      # even though it makes sense to reference page_url here, it will only look
      # at (url, referer_url).
      def illust_id
-        return nil if novel_id.present? || fanbox_id.present?
+        return nil if novel_id.present?

        parsed_urls.each do |url|
          # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
@@ -328,46 +303,11 @@ module Sources
      end
      memoize :novel_id

-      def fanbox_id
-        [url, referer_url].each do |x|
-          if x =~ FANBOX_PAGE
-            return $1
-          end
-
-          if x =~ FANBOX_IMAGE
-            return $1
-          end
-        end
-
-        nil
-      end
-      memoize :fanbox_id
-
-      def fanbox_account_id
-        [url, referer_url].each do |x|
-          if x =~ FANBOX_ACCOUNT
-            return x
-          end
-        end
-
-        nil
-      end
-      memoize :fanbox_account_id
-
-      def agent
-        PixivWebAgent.build
-      end
-      memoize :agent
-
      def metadata
        if novel_id.present?
          return PixivApiClient.new.novel(novel_id)
        end

-        if fanbox_id.present?
-          return PixivApiClient.new.fanbox(fanbox_id)
-        end
-
        PixivApiClient.new.work(illust_id)
      end
      memoize :metadata
--- a/app/logical/sources/strategies/tumblr.rb
+++ b/app/logical/sources/strategies/tumblr.rb
@@ -23,7 +23,7 @@ module Sources::Strategies
    OLD_IMAGE = %r{\Ahttps?://#{DOMAIN}/(?<dir>#{MD5}/)?#{FILENAME}_(?<size>\w+)\.#{EXT}\z}i

    IMAGE = %r{\Ahttps?://#{DOMAIN}/}i
-    VIDEO = %r{\Ahttps?://(?:vtt|ve\.media)\.tumblr\.com/}i
+    VIDEO = %r{\Ahttps?://(?:vtt|ve|v[ae]\.media)\.tumblr\.com/}i # hosts: vtt, ve, ve.media, va.media
    POST = %r{\Ahttps?://(?<blog_name>[^.]+)\.tumblr\.com/(?:post|image)/(?<post_id>\d+)}i

    def self.enabled?
@@ -168,7 +168,7 @@ module Sources::Strategies
      end

      candidates.find do |candidate|
-        http_exists?(candidate, headers)
+        http_exists?(candidate)
      end
    end

--- a/app/logical/sources/strategies/twitter.rb
+++ b/app/logical/sources/strategies/twitter.rb
@@ -200,7 +200,7 @@ module Sources::Strategies
    end

    def api_response
-      return {} unless self.class.enabled?
+      return {} if !self.class.enabled? || status_id.blank? # nothing to look up without the API or a status id
      api_client.status(status_id)
    end