From 33db1a27618e0afee0e8e485fc9e2eff4065f3ba Mon Sep 17 00:00:00 2001
From: nonamethanks <hellafrickingepic@gmail.com>
Date: Fri, 14 Jan 2022 11:46:26 +0100
Subject: [PATCH] Skeb: fix several issues with the strategy

* Fix fetching of videos
* Fix fetching of original commentary
* Fix images being returned out of order in bookmarklet
---
 app/logical/sources/strategies/skeb.rb | 61 ++++++++++++++++----------
 test/unit/sources/skeb_test.rb         | 46 ++++++++++---------
 2 files changed, 62 insertions(+), 45 deletions(-)
diff --git a/app/logical/sources/strategies/skeb.rb b/app/logical/sources/strategies/skeb.rb
index d8e37aa51..8351a5b06 100644
--- a/app/logical/sources/strategies/skeb.rb
+++ b/app/logical/sources/strategies/skeb.rb
@@ -26,18 +26,21 @@ module Sources
   module Strategies
     class Skeb < Base
       PROFILE_URL = %r{https?://(?:www\.)?skeb\.jp/@(?<artist_name>\w+)}i
-
       PAGE_URL    = %r{#{PROFILE_URL}/works/(?<illust_id>\d+)}i
-
-      IMAGE_URL   = %r{https?://(?:www\.)?skeb\.imgix\.net/(requests|uploads/origins)/.*}i
+      IMAGE_URL   = %r{https?://(?:(?:www\.)?skeb\.imgix\.net|skeb-production\.s3\.ap-northeast-1\.amazonaws\.com)/.+}i
+      UUID_REGEX  = %r{/(?<uuid>(?:(?:\w+-)+\w+|(?:\d+_\d+))).*(?:fm=(?<type>\w+))?.*}
 
       def domains
         ["skeb.jp"]
       end
 
+      def image_domains
+        ["skeb.imgix.net", "skeb-production.s3.ap-northeast-1.amazonaws.com"]
+      end
+
       def match?
         return false if parsed_url.nil?
-        parsed_url.domain.in?(domains) || parsed_url.host == "skeb.imgix.net"
+        parsed_url.domain.in?(domains) || parsed_url.host.in?(image_domains)
       end
 
       def site_name
@@ -47,33 +50,41 @@ module Sources
       def image_urls
         if url =~ IMAGE_URL
           [url]
-        elsif page.present?
-          # Heavy heuristic to extract the uncropped image among the nighmare that is the skeb minified json
-          candidates = page&.css("script")&.map { |script| script.text&.scan(/(https:\\u002F\\u002Fskeb\.imgix\.net.*?)(?:"|,|\s)/) }
-          candidates = candidates.to_a.flatten.compact.uniq.reject { |match| match.include? "crop=" }
-          # sometimes skeb offers a slightly-smaller, non-watermarked version picture
-          unwatermarked = candidates.reject { |match| match.include? "=SAMPLE" }
-          unsampled = unwatermarked.reject { |match| match.include? "q=" }
+        elsif api_response.present?
+          previews = api_response["previews"].to_a.map { |preview| preview&.dig("url") }.compact.uniq
 
-          final_candidates = [unsampled, unwatermarked, candidates].reject(&:empty?).first&.to_a
-          final_candidates.map { |img| img.gsub("\\u002F", "/") }
+          unwatermarked = api_response["article_image_url"]
+          return previews unless unwatermarked.present?
+          previews.map do |p|
+            next p unless p[UUID_REGEX, :uuid].present? && p[UUID_REGEX, :uuid] == unwatermarked[UUID_REGEX, :uuid]
+            next p if p[/fm=(\w+)/, 1].in?(["gif", "mp4"])
+            next p unless p.include?("&txt=")
+
+            unwatermarked
+          end
         else
           []
         end
       end
 
       def page_url
-        urls.map { |u| u if u =~ PAGE_URL }.compact.first
+        return unless artist_name.present? && illust_id.present?
+        "https://skeb.jp/@#{artist_name}/works/#{illust_id}"
       end
 
       def normalize_for_source
         page_url
       end
 
-      def page
-        return if page_url.blank?
-        response = http.cache(1.minute).get(page_url)
-        return nil unless response.status == 200
+      def api_response
+        return {} unless artist_name.present? && illust_id.present?
+        headers = {
+          Referer: profile_url,
+          Authorization: "Bearer null",
+        }
+        api_url = "https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"
+        response = http.cache(1.minute).headers(headers).get(api_url)
+        return {} unless response.status == 200
         # The status check is required for private commissions, which return 404
 
         response.parse
@@ -89,7 +100,11 @@ module Sources
       end
 
       def display_name
-        page&.at("title")&.text&.match(/.*by (.*?) \| skeb/i).to_a[1]
+        api_response&.dig("creator", "name")
+      end
+
+      def illust_id
+        urls.map { |u| u[PAGE_URL, :illust_id] }.compact.first
       end
 
       def other_names
@@ -97,19 +112,17 @@ module Sources
       end
 
       def artist_commentary_desc
+        api_response&.dig("source_body") || api_response&.dig("body")
         # skeb "titles" are not needed: it's just the first few characters of the description
-        return if page.blank?
-        page.at("[property='og:description']")["content"]
       end
 
       def client_response
-        return if page.blank?
-        page.text[/window\.__NUXT__=.*,thanks:"(.*?)",/, 1]&.gsub(/\\n/, "\n")
+        api_response&.dig("source_thanks") || api_response&.dig("thanks")
       end
 
       def dtext_artist_commentary_desc
         if client_response.present? && artist_commentary_desc.present?
-          "h5. Original Request:\n#{artist_commentary_desc}\n\nh5. Client Response:\n#{client_response}"
+          "h6. Original Request:\n\n#{artist_commentary_desc}\n\nh6. Client Response:\n\n#{client_response}"
         else
           artist_commentary_desc
         end
diff --git a/test/unit/sources/skeb_test.rb b/test/unit/sources/skeb_test.rb
index fdfa7a0dc..265132695 100644
--- a/test/unit/sources/skeb_test.rb
+++ b/test/unit/sources/skeb_test.rb
@@ -11,23 +11,6 @@ module Sources
         assert_equal("kai_chiisame", @site.artist_name)
       end
 
-      should "get the artist commentary" do
-        commentary = <<~COMM.chomp
-          初めまして、先日アピールを頂きましたのでリクエストさせて頂きます。
-
-          〇キャラ
-          　東方の東風谷早苗さん
-
-          〇内容
-          　・水着や薄着などの若干セクシーめ・肌色多めな方向性で、細部は絵師さんにお任せ
-          　・念のためNSFW指定にしましたがエロでなくていいです
-
-          ご検討お願いします。
-        COMM
-
-        assert_equal(commentary, @site.artist_commentary_desc)
-      end
-
       should "get profile url" do
         assert_equal("https://skeb.jp/@kai_chiisame", @site.profile_url)
       end
@@ -72,25 +55,46 @@ module Sources
       end
     end
 
+    context "An animated post with a smaller static unwatermarked version" do
+      should "still get the watermarked gif" do
+        site = Sources::Strategies.find("https://skeb.jp/@tontaro_/works/316")
+        assert_equal("https://skeb.imgix.net/uploads/origins/5097b1e1-18ce-418e-82f0-e7e2cdab1cea?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=mp4&w=800&s=fcff06871e114b3dbf505c04f27b5ed1", site.image_url)
+      end
+    end
+
     context "A post with both the small and large version clean" do
       should "just get the bigger image" do
-        site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/149")
-        assert_equal(["https://skeb.imgix.net/uploads/origins/ebe94108-7ca7-4b3d-b80c-b37759ffd695?bg=%23fff&fm=jpg&q=45&w=696&s=9c4e093a440fe4030ac1596813ce7e17"], site.image_urls)
+        site = Sources::Strategies.find("https://skeb.jp/@goma_feet/works/1")
+        assert_equal(["https://skeb.imgix.net/uploads/origins/78ca23dc-a053-4ebe-894f-d5a06e228af8?bg=%23fff&auto=format&w=800&s=3de55b04236059113659f99fd6900d7d"], site.image_urls)
       end
     end
 
     context "A post with two images" do
-      should "get both correctly" do
+      should "get both correctly and in the right order" do
         site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/146")
         image_urls = %w[
-          https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40
           https://skeb.imgix.net/uploads/origins/3fc062c5-231d-400f-921f-22d77cde54df?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=80a1373b3f8e9bf0108d201fba34de71
+          https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40
         ]
 
         assert_equal(image_urls, site.image_urls)
       end
     end
 
+    context "A post with a video" do
+      should "get it correctly" do
+        site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112")
+        assert_match(%r{\Ahttps://skeb-production\.s3\.ap-northeast-1\.amazonaws\.com/uploads/outputs/20f9d68f-50ec-44ae-8630-173fc38a2d6a\?}, site.image_url) # query string is a time-limited S3 signature, so only the stable path is pinned
+      end
+    end
+
+    context "A post with both original and autotranslated commentary" do
+      should "get the original commentary" do
+        site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112")
+        assert_match(/I would like to request an animation screen for my Twitch channel. My character is a catgirl/, site.dtext_artist_commentary_desc)
+      end
+    end
+
     context "normalizing for source" do
       should "avoid normalizing unnormalizable urls" do
         bad_source = "https://skeb.imgix.net/requests/229088_2?bg=%23fff&auto=format&w=800&s=9cac8b76c0838f2df4f19ebc41c1ae0a"