pixiv: fix API breakage.

Fix the Pixiv API no longer working by rewriting the Pixiv strategy to use the Ajax API instead of the mobile API. Before we could authenticate in the mobile API by using the OAuth 2.0 grant_type=password authentication flow. This no longer works. Now it requires logging in through a HTML page, which is protected by Google reCaptcha. This makes using the mobile API infeasible. Instead we switch to the Ajax API, which only needs a PHPSESSID to authenticate. This can be obtained by logging in manually and using the devtools to extract the cookie. This also temporarily removes support for Pixiv novels. This should be moved to a separate source strategy.
2021-02-09 05:48:02 -06:00
parent 7520c4db49
commit 39cc3ed5cf
8 changed files with 458 additions and 317 deletions
--- a/app/logical/sources/strategies.rb
+++ b/app/logical/sources/strategies.rb
@@ -2,7 +2,7 @@ module Sources
  module Strategies
    def self.all
      [
-        #Strategies::Pixiv,
+        Strategies::Pixiv,
        Strategies::Fanbox,
        Strategies::NicoSeiga,
        Strategies::Twitter,
--- a/app/logical/sources/strategies/pixiv.rb
+++ b/app/logical/sources/strategies/pixiv.rb
@@ -48,27 +48,22 @@ module Sources
      I12 =     %r{(?:\A(?:https?://)?i[0-9]+\.pixiv\.net)}
      IMG =     %r{(?:\A(?:https?://)?img[0-9]*\.pixiv\.net)}
      PXIMG =   %r{(?:\A(?:https?://)?[^.]+\.pximg\.net)}
-      TOUCH =   %r{(?:\A(?:https?://)?touch\.pixiv\.net)}
      UGOIRA =  %r{#{PXIMG}/img-zip-ugoira/img/#{DATE}/(?<illust_id>\d+)_ugoira1920x1080\.zip\z}i
      ORIG_IMAGE = %r{#{PXIMG}/img-original/img/#{DATE}/(?<illust_id>\d+)_p(?<page>\d+)\.#{EXT}\z}i
-      STACC_PAGE = %r{\A#{WEB}/stacc/#{MONIKER}/?\z}i
-      NOVEL_PAGE = %r{(?:\Ahttps?://www\.pixiv\.net/novel/show\.php\?id=(\d+))}

      def self.enabled?
-        Danbooru.config.pixiv_login.present? && Danbooru.config.pixiv_password.present?
+        Danbooru.config.pixiv_phpsessid.present?
      end

      def self.to_dtext(text)
-        if text.nil?
-          return nil
-        end
+        return nil if text.nil?

-        text = text.gsub(%r{https?://www\.pixiv\.net/member_illust\.php\?mode=medium&illust_id=([0-9]+)}i) do |_match|
+        text = text.gsub(%r{<a href="https?://www\.pixiv\.net/en/artworks/([0-9]+)">illust/[0-9]+</a>}i) do |_match|
          pixiv_id = $1
          %(pixiv ##{pixiv_id} "»":[#{Routes.posts_path(tags: "pixiv:#{pixiv_id}")}])
        end

-        text = text.gsub(%r{https?://www\.pixiv\.net/member\.php\?id=([0-9]+)}i) do |_match|
+        text = text.gsub(%r{<a href="https?://www\.pixiv\.net/en/users/([0-9]+)">user/[0-9]+</a>}i) do |_match|
          member_id = $1
          profile_url = "https://www.pixiv.net/users/#{member_id}"
          artist_search_url = Routes.artists_path(search: { url_matches: profile_url })
@@ -76,7 +71,6 @@ module Sources
          %("user/#{member_id}":[#{profile_url}] "»":[#{artist_search_url}])
        end

-        text = text.gsub(/\r\n|\r|\n/, "<br>")
        DText.from_html(text)
      end

@@ -95,9 +89,19 @@ module Sources
      end

      def image_urls
-        image_urls_sub
-      rescue PixivApiClient::BadIDError
-        [url]
+        if is_ugoira?
+          [api_ugoira[:originalSrc]]
+        elsif manga_page.present? && original_urls.present?
+          [original_urls[manga_page]]
+        elsif original_urls.present?
+          original_urls
+        else
+          [url]
+        end
+      end
+
+      def original_urls
+        api_pages.map { |page| page.dig("urls", "original") }
      end

      def preview_urls
@@ -114,17 +118,8 @@ module Sources
      end

      def page_url
-        if novel_id.present?
-          return "https://www.pixiv.net/novel/show.php?id=#{novel_id}&mode=cover"
-        end
-
-        if illust_id.present?
-          return "https://www.pixiv.net/artworks/#{illust_id}"
-        end
-
-        url
-      rescue PixivApiClient::BadIDError
-        nil
+        return nil if illust_id.blank?
+        "https://www.pixiv.net/artworks/#{illust_id}"
      end

      def canonical_url
@@ -132,15 +127,15 @@ module Sources
      end

      def profile_url
-        [url, referer_url].each do |x|
-          if x =~ PROFILE
-            return x
-          end
-        end
+        url = urls.find { |url| url.match?(PROFILE) }

-        "https://www.pixiv.net/users/#{metadata.user_id}"
-      rescue PixivApiClient::BadIDError
-        nil
+        if url.present?
+          url
+        elsif api_illust[:userId].present?
+          "https://www.pixiv.net/users/#{api_illust[:userId]}"
+        else
+          nil
+        end
      end

      def stacc_url
@@ -153,9 +148,7 @@ module Sources
      end

      def artist_name
-        metadata.name
-      rescue PixivApiClient::BadIDError
-        nil
+        api_illust[:userName]
      end

      def other_names
@@ -163,15 +156,11 @@ module Sources
      end

      def artist_commentary_title
-        metadata.artist_commentary_title
-      rescue PixivApiClient::BadIDError
-        nil
+        api_illust[:title]
      end

      def artist_commentary_desc
-        metadata.artist_commentary_desc
-      rescue PixivApiClient::BadIDError
-        nil
+        api_illust[:description]
      end

      def headers
@@ -179,8 +168,7 @@ module Sources
      end

      def normalize_for_source
-        return if illust_id.blank?
-
+        return nil if illust_id.blank?
        "https://www.pixiv.net/artworks/#{illust_id}"
      end

@@ -189,11 +177,10 @@ module Sources
      end

      def tags
-        metadata.tags.map do |tag|
+        api_illust.dig(:tags, :tags).to_a.map do |item|
+          tag = item[:tag]
          [tag, "https://www.pixiv.net/search.php?s_mode=s_tag_full&#{{word: tag}.to_param}"]
        end
-      rescue PixivApiClient::BadIDError
-        []
      end

      def normalize_tag(tag)
@@ -214,28 +201,12 @@ module Sources
        illust_id.present? ? "pixiv:#{illust_id}" : "source:#{canonical_url}"
      end

-      def image_urls_sub
-        # there's too much normalization bullshit we have to deal with
-        # raw urls, so just fetch the canonical url from the api every
-        # time.
-        if manga_page.present?
-          return [metadata.pages[manga_page]]
-        end
-
-        if metadata.pages.is_a?(Hash)
-          return [ugoira_zip_url]
-        end
-
-        metadata.pages
+      def is_ugoira?
+        # https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
+        url.match?(UGOIRA) || api_illust.dig(:urls, :original)&.match?(/ugoira/)
      end

-      # in order to prevent recursive loops, this method should not make any
-      # api calls and only try to extract the illust_id from the url. therefore,
-      # even though it makes sense to reference page_url here, it will only look
-      # at (url, referer_url).
      def illust_id
-        return nil if novel_id.present?
-
        parsed_urls.each do |url|
          # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
          # http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
@@ -284,27 +255,22 @@ module Sources

        nil
      end
-      memoize :illust_id

-      def novel_id
-        [url, referer_url].each do |x|
-          if x =~ NOVEL_PAGE
-            return $1
-          end
-        end
-
-        nil
+      def api_client
+        PixivAjaxClient.new(Danbooru.config.pixiv_phpsessid)
      end
-      memoize :novel_id

-      def metadata
-        if novel_id.present?
-          return PixivApiClient.new.novel(novel_id)
-        end
-
-        PixivApiClient.new.work(illust_id)
+      def api_illust
+        api_client.illust(illust_id)
+      end
+
+      def api_pages
+        api_client.pages(illust_id)
+      end
+
+      def api_ugoira
+        api_client.ugoira_meta(illust_id)
      end
-      memoize :metadata

      def moniker
        # we can sometimes get the moniker from the url
@@ -315,44 +281,17 @@ module Sources
        elsif url =~ %r{#{WEB}/stacc/(#{MONIKER})/?$}i
          $1
        else
-          metadata.moniker
+          api_illust[:userAccount]
        end
-      rescue PixivApiClient::BadIDError
-        nil
      end
-      memoize :moniker

      def data
-        { ugoira_frame_data: ugoira_frame_data }
+        { ugoira_frame_data: api_ugoira[:frames] }
      end

-      def ugoira_zip_url
-        if metadata.pages.is_a?(Hash) && metadata.pages["ugoira600x600"]
-          metadata.pages["ugoira600x600"].sub("_ugoira600x600.zip", "_ugoira1920x1080.zip")
-        end
-      end
-      memoize :ugoira_zip_url
-
-      def ugoira_frame_data
-        metadata.json.dig("metadata", "frames")
-      rescue PixivApiClient::BadIDError
-        nil
-      end
-      memoize :ugoira_frame_data
-
      def ugoira_content_type
-        case metadata.json["image_urls"].to_s
-        when /\.jpg/
-          "image/jpeg"
-        when /\.png/
-          "image/png"
-        when /\.gif/
-          "image/gif"
-        else
-          raise Sources::Error, "content type not found for (#{url}, #{referer_url})"
-        end
+        api_ugoira[:mime_type]
      end
-      memoize :ugoira_content_type

      # Returns the current page number of the manga. This will not
      # make any api calls and only looks at (url, referer_url).
@@ -373,7 +312,8 @@ module Sources

        nil
      end
-      memoize :manga_page
+
+      memoize :illust_id, :api_client, :api_illust, :api_pages, :api_ugoira
    end
  end
 end