From 33db1a27618e0afee0e8e485fc9e2eff4065f3ba Mon Sep 17 00:00:00 2001 From: nonamethanks Date: Fri, 14 Jan 2022 11:46:26 +0100 Subject: [PATCH] Skeb: fix several issues with the strategy * Fix fetching of videos * Fix fetching of original commentary * Fix images being returned out of order in bookmarklet --- app/logical/sources/strategies/skeb.rb | 61 ++++++++++++++++---------- test/unit/sources/skeb_test.rb | 46 ++++++++++--------- 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/app/logical/sources/strategies/skeb.rb b/app/logical/sources/strategies/skeb.rb index d8e37aa51..8351a5b06 100644 --- a/app/logical/sources/strategies/skeb.rb +++ b/app/logical/sources/strategies/skeb.rb @@ -26,18 +26,21 @@ module Sources module Strategies class Skeb < Base PROFILE_URL = %r{https?://(?:www\.)?skeb\.jp/@(?<artist_name>\w+)}i - PAGE_URL = %r{#{PROFILE_URL}/works/(?<illust_id>\d+)}i - - IMAGE_URL = %r{https?://(?:www\.)?skeb\.imgix\.net/(requests|uploads/origins)/.*}i + IMAGE_URL = %r{https?://(?:(?:www\.)?skeb\.imgix\.net|skeb-production.s3.ap-northeast-1.amazonaws.com/)/.+}i + UUID_REGEX = %r{/(?<uuid>(?:(?:\w+-)+\w+|(?:\d+_\d+))).*(?:fm=(?<type>\w+))?.*} def domains ["skeb.jp"] end + def image_domains + ["skeb.imgix.net", "skeb-production.s3.ap-northeast-1.amazonaws.com"] + end + def match? return false if parsed_url.nil? - parsed_url.domain.in?(domains) || parsed_url.host == "skeb.imgix.net" + parsed_url.domain.in?(domains) || parsed_url.host.in?(image_domains) end def site_name @@ -47,33 +50,41 @@ module Sources def image_urls if url =~ IMAGE_URL [url] - elsif page.present? - # Heavy heuristic to extract the uncropped image among the nighmare that is the skeb minified json - candidates = page&.css("script")&.map { |script| script.text&.scan(/(https:\\u002F\\u002Fskeb\.imgix\.net.*?)(?:"|,|\s)/) } - candidates = candidates.to_a.flatten.compact.uniq.reject { |match| match.include? 
"crop=" } - # sometimes skeb offers a slightly-smaller, non-watermarked version picture - unwatermarked = candidates.reject { |match| match.include? "=SAMPLE" } - unsampled = unwatermarked.reject { |match| match.include? "q=" } + elsif api_response.present? + previews = api_response["previews"].to_a.map { |preview| preview&.dig("url") }.compact.uniq - final_candidates = [unsampled, unwatermarked, candidates].reject(&:empty?).first&.to_a - final_candidates.map { |img| img.gsub("\\u002F", "/") } + unwatermarked = api_response["article_image_url"] + return previews unless unwatermarked.present? + previews.map do |p| + next p unless p[UUID_REGEX, :uuid].present? && p[UUID_REGEX, :uuid] == unwatermarked[UUID_REGEX, :uuid] + next p if p[/fm=(\w+)/, 1].in?(["gif", "mp4"]) + next p unless p.include?("&txt=") + + unwatermarked + end else [] end end def page_url - urls.map { |u| u if u =~ PAGE_URL }.compact.first + return unless artist_name.present? && illust_id.present? + "https://skeb.jp/@#{artist_name}/works/#{illust_id}" end def normalize_for_source page_url end - def page - return if page_url.blank? - response = http.cache(1.minute).get(page_url) - return nil unless response.status == 200 + def api_response + return {} unless artist_name.present? && illust_id.present? + headers = { + Referer: profile_url, + Authorization: "Bearer null", + } + api_url = "https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}" + response = http.cache(1.minute).headers(headers).get(api_url) + return {} unless response.status == 200 # The status check is required for private commissions, which return 404 response.parse @@ -89,7 +100,11 @@ module Sources end def display_name - page&.at("title")&.text&.match(/.*by (.*?) 
\| skeb/i).to_a[1] + api_response&.dig("creator", "name") + end + + def illust_id + urls.map { |u| u[PAGE_URL, :illust_id] }.compact.first end def other_names @@ -97,19 +112,17 @@ module Sources end def artist_commentary_desc + api_response&.dig("source_body") || api_response&.dig("body") # skeb "titles" are not needed: it's just the first few characters of the description - return if page.blank? - page.at("[property='og:description']")["content"] end def client_response - return if page.blank? - page.text[/window\.__NUXT__=.*,thanks:"(.*?)",/, 1]&.gsub(/\\n/, "\n") + api_response&.dig("source_thanks") || api_response&.dig("thanks") end def dtext_artist_commentary_desc if client_response.present? && artist_commentary_desc.present? - "h5. Original Request:\n#{artist_commentary_desc}\n\nh5. Client Response:\n#{client_response}" + "h6. Original Request:\n\n#{artist_commentary_desc}\n\nh6. Client Response:\n\n#{client_response}" else artist_commentary_desc end diff --git a/test/unit/sources/skeb_test.rb b/test/unit/sources/skeb_test.rb index fdfa7a0dc..265132695 100644 --- a/test/unit/sources/skeb_test.rb +++ b/test/unit/sources/skeb_test.rb @@ -11,23 +11,6 @@ module Sources assert_equal("kai_chiisame", @site.artist_name) end - should "get the artist commentary" do - commentary = <<~COMM.chomp - 初めまして、先日アピールを頂きましたのでリクエストさせて頂きます。 - - 〇キャラ -  東方の東風谷早苗さん - - 〇内容 -  ・水着や薄着などの若干セクシーめ・肌色多めな方向性で、細部は絵師さんにお任せ -  ・念のためNSFW指定にしましたがエロでなくていいです - - ご検討お願いします。 - COMM - - assert_equal(commentary, @site.artist_commentary_desc) - end - should "get profile url" do assert_equal("https://skeb.jp/@kai_chiisame", @site.profile_url) end @@ -72,25 +55,46 @@ module Sources end end + context "An animated post with a smaller static unwatermarked version" do + should "still get the watermarked gif" do + site = Sources::Strategies.find("https://skeb.jp/@tontaro_/works/316") + 
assert_equal("https://skeb.imgix.net/uploads/origins/5097b1e1-18ce-418e-82f0-e7e2cdab1cea?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=mp4&w=800&s=fcff06871e114b3dbf505c04f27b5ed1", site.image_url) + end + end + context "A post with both the small and large version clean" do should "just get the bigger image" do - site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/149") - assert_equal(["https://skeb.imgix.net/uploads/origins/ebe94108-7ca7-4b3d-b80c-b37759ffd695?bg=%23fff&fm=jpg&q=45&w=696&s=9c4e093a440fe4030ac1596813ce7e17"], site.image_urls) + site = Sources::Strategies.find("https://skeb.jp/@goma_feet/works/1") + assert_equal(["https://skeb.imgix.net/uploads/origins/78ca23dc-a053-4ebe-894f-d5a06e228af8?bg=%23fff&auto=format&w=800&s=3de55b04236059113659f99fd6900d7d"], site.image_urls) end end context "A post with two images" do - should "get both correctly" do + should "get both correctly and in the right order" do site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/146") image_urls = %w[ - https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40 https://skeb.imgix.net/uploads/origins/3fc062c5-231d-400f-921f-22d77cde54df?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=80a1373b3f8e9bf0108d201fba34de71 + https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40 ] assert_equal(image_urls, site.image_urls) end end + context "A post with a video" do + should "get it correctly" do + site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112") + 
assert_equal(site.image_url, "https://skeb-production.s3.ap-northeast-1.amazonaws.com/uploads/outputs/20f9d68f-50ec-44ae-8630-173fc38a2d6a?response-content-disposition=attachment%3B%20filename%3D%22458093-1.output.mp4%22%3B%20filename%2A%3DUTF-8%27%27458093-1.output.mp4&response-content-type=video%2Fmp4&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIVPUTFQBBL7UDSUA%2F20220113%2Fap-northeast-1%2Fs3%2Faws4_request&X-Amz-Date=20220113T141927Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=260c90b0755d894493fef478f806ac3fac0b94f4c8efb3df4f4f2a98309d09f0") + end + end + + context "A post with both original and autotranslated commentary" do + should "get the original commentary" do + site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112") + assert_match(/I would like to request an animation screen for my Twitch channel. My character is a catgirl/, site.dtext_artist_commentary_desc) + end + end + context "normalizing for source" do should "avoid normalizing unnormalizable urls" do bad_source = "https://skeb.imgix.net/requests/229088_2?bg=%23fff&auto=format&w=800&s=9cac8b76c0838f2df4f19ebc41c1ae0a"