Merge pull request #4956 from nonamethanks/fix-skeb

Skeb: fix several issues with the strategy
This commit is contained in:
evazion
2022-01-14 22:04:44 -06:00
committed by GitHub
2 changed files with 62 additions and 45 deletions

View File

@@ -26,18 +26,21 @@ module Sources
module Strategies module Strategies
class Skeb < Base class Skeb < Base
PROFILE_URL = %r{https?://(?:www\.)?skeb\.jp/@(?<artist_name>\w+)}i PROFILE_URL = %r{https?://(?:www\.)?skeb\.jp/@(?<artist_name>\w+)}i
PAGE_URL = %r{#{PROFILE_URL}/works/(?<illust_id>\d+)}i PAGE_URL = %r{#{PROFILE_URL}/works/(?<illust_id>\d+)}i
IMAGE_URL = %r{https?://(?:(?:www\.)?skeb\.imgix\.net|skeb-production.s3.ap-northeast-1.amazonaws.com/)/.+}i
IMAGE_URL = %r{https?://(?:www\.)?skeb\.imgix\.net/(requests|uploads/origins)/.*}i UUID_REGEX = %r{/(?<uuid>(?:(?:\w+-)+\w+|(?:\d+_\d+))).*(?:fm=(?<type>\w+))?.*}
def domains def domains
["skeb.jp"] ["skeb.jp"]
end end
def image_domains
["skeb.imgix.net", "skeb-production.s3.ap-northeast-1.amazonaws.com"]
end
def match? def match?
return false if parsed_url.nil? return false if parsed_url.nil?
parsed_url.domain.in?(domains) || parsed_url.host == "skeb.imgix.net" parsed_url.domain.in?(domains) || parsed_url.host.in?(image_domains)
end end
def site_name def site_name
@@ -47,33 +50,41 @@ module Sources
def image_urls def image_urls
if url =~ IMAGE_URL if url =~ IMAGE_URL
[url] [url]
elsif page.present? elsif api_response.present?
# Heavy heuristic to extract the uncropped image among the nightmare that is the skeb minified json previews = api_response["previews"].to_a.map { |preview| preview&.dig("url") }.compact.uniq
candidates = page&.css("script")&.map { |script| script.text&.scan(/(https:\\u002F\\u002Fskeb\.imgix\.net.*?)(?:"|,|\s)/) }
candidates = candidates.to_a.flatten.compact.uniq.reject { |match| match.include? "crop=" }
# sometimes skeb offers a slightly-smaller, non-watermarked version picture
unwatermarked = candidates.reject { |match| match.include? "=SAMPLE" }
unsampled = unwatermarked.reject { |match| match.include? "q=" }
final_candidates = [unsampled, unwatermarked, candidates].reject(&:empty?).first&.to_a unwatermarked = api_response["article_image_url"]
final_candidates.map { |img| img.gsub("\\u002F", "/") } return previews unless unwatermarked.present?
previews.map do |p|
next p unless p[UUID_REGEX, :uuid].present? && p[UUID_REGEX, :uuid] == unwatermarked[UUID_REGEX, :uuid]
next p if p[/fm=(\w+)/, 1].in?(["gif", "mp4"])
next p unless p.include?("&txt=")
unwatermarked
end
else else
[] []
end end
end end
def page_url def page_url
urls.map { |u| u if u =~ PAGE_URL }.compact.first return unless artist_name.present? && illust_id.present?
"https://skeb.jp/@#{artist_name}/works/#{illust_id}"
end end
def normalize_for_source def normalize_for_source
page_url page_url
end end
def page def api_response
return if page_url.blank? return {} unless artist_name.present? && illust_id.present?
response = http.cache(1.minute).get(page_url) headers = {
return nil unless response.status == 200 Referer: profile_url,
Authorization: "Bearer null",
}
api_url = "https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"
response = http.cache(1.minute).headers(headers).get(api_url)
return {} unless response.status == 200
# The status check is required for private commissions, which return 404 # The status check is required for private commissions, which return 404
response.parse response.parse
@@ -89,7 +100,11 @@ module Sources
end end
def display_name def display_name
page&.at("title")&.text&.match(/.*by (.*?) \| skeb/i).to_a[1] api_response&.dig("creator", "name")
end
def illust_id
urls.map { |u| u[PAGE_URL, :illust_id] }.compact.first
end end
def other_names def other_names
@@ -97,19 +112,17 @@ module Sources
end end
def artist_commentary_desc def artist_commentary_desc
api_response&.dig("source_body") || api_response&.dig("body")
# skeb "titles" are not needed: it's just the first few characters of the description # skeb "titles" are not needed: it's just the first few characters of the description
return if page.blank?
page.at("[property='og:description']")["content"]
end end
def client_response def client_response
return if page.blank? api_response&.dig("source_thanks") || api_response&.dig("thanks")
page.text[/window\.__NUXT__=.*,thanks:"(.*?)",/, 1]&.gsub(/\\n/, "\n")
end end
def dtext_artist_commentary_desc def dtext_artist_commentary_desc
if client_response.present? && artist_commentary_desc.present? if client_response.present? && artist_commentary_desc.present?
"h5. Original Request:\n#{artist_commentary_desc}\n\nh5. Client Response:\n#{client_response}" "h6. Original Request:\n\n#{artist_commentary_desc}\n\nh6. Client Response:\n\n#{client_response}"
else else
artist_commentary_desc artist_commentary_desc
end end

View File

@@ -11,23 +11,6 @@ module Sources
assert_equal("kai_chiisame", @site.artist_name) assert_equal("kai_chiisame", @site.artist_name)
end end
should "get the artist commentary" do
commentary = <<~COMM.chomp
 
 
 NSFW指定にしましたがエロでなくていいです
COMM
assert_equal(commentary, @site.artist_commentary_desc)
end
should "get profile url" do should "get profile url" do
assert_equal("https://skeb.jp/@kai_chiisame", @site.profile_url) assert_equal("https://skeb.jp/@kai_chiisame", @site.profile_url)
end end
@@ -72,25 +55,46 @@ module Sources
end end
end end
context "An animated post with a smaller static unwatermarked version" do
should "still get the watermarked gif" do
site = Sources::Strategies.find("https://skeb.jp/@tontaro_/works/316")
assert_equal("https://skeb.imgix.net/uploads/origins/5097b1e1-18ce-418e-82f0-e7e2cdab1cea?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=mp4&w=800&s=fcff06871e114b3dbf505c04f27b5ed1", site.image_url)
end
end
context "A post with both the small and large version clean" do context "A post with both the small and large version clean" do
should "just get the bigger image" do should "just get the bigger image" do
site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/149") site = Sources::Strategies.find("https://skeb.jp/@goma_feet/works/1")
assert_equal(["https://skeb.imgix.net/uploads/origins/ebe94108-7ca7-4b3d-b80c-b37759ffd695?bg=%23fff&fm=jpg&q=45&w=696&s=9c4e093a440fe4030ac1596813ce7e17"], site.image_urls) assert_equal(["https://skeb.imgix.net/uploads/origins/78ca23dc-a053-4ebe-894f-d5a06e228af8?bg=%23fff&auto=format&w=800&s=3de55b04236059113659f99fd6900d7d"], site.image_urls)
end end
end end
context "A post with two images" do context "A post with two images" do
should "get both correctly" do should "get both correctly and in the right order" do
site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/146") site = Sources::Strategies.find("https://skeb.jp/@LambOic029/works/146")
image_urls = %w[ image_urls = %w[
https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40
https://skeb.imgix.net/uploads/origins/3fc062c5-231d-400f-921f-22d77cde54df?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=80a1373b3f8e9bf0108d201fba34de71 https://skeb.imgix.net/uploads/origins/3fc062c5-231d-400f-921f-22d77cde54df?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=80a1373b3f8e9bf0108d201fba34de71
https://skeb.imgix.net/uploads/origins/e888bb27-e1a6-48ec-a317-7615252ff818?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=9df9b46bbfad404d3a65c7c56b0cbf40
] ]
assert_equal(image_urls, site.image_urls) assert_equal(image_urls, site.image_urls)
end end
end end
context "A post with a video" do
should "get it correctly" do
site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112")
assert_equal(site.image_url, "https://skeb-production.s3.ap-northeast-1.amazonaws.com/uploads/outputs/20f9d68f-50ec-44ae-8630-173fc38a2d6a?response-content-disposition=attachment%3B%20filename%3D%22458093-1.output.mp4%22%3B%20filename%2A%3DUTF-8%27%27458093-1.output.mp4&response-content-type=video%2Fmp4&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIVPUTFQBBL7UDSUA%2F20220113%2Fap-northeast-1%2Fs3%2Faws4_request&X-Amz-Date=20220113T141927Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=260c90b0755d894493fef478f806ac3fac0b94f4c8efb3df4f4f2a98309d09f0")
end
end
context "A post with both original and autotranslated commentary" do
should "get the original commentary" do
site = Sources::Strategies.find("https://skeb.jp/@kaisouafuro/works/112")
assert_match(/I would like to request an animation screen for my Twitch channel. My character is a catgirl/, site.dtext_artist_commentary_desc)
end
end
context "normalizing for source" do context "normalizing for source" do
should "avoid normalizing unnormalizable urls" do should "avoid normalizing unnormalizable urls" do
bad_source = "https://skeb.imgix.net/requests/229088_2?bg=%23fff&auto=format&w=800&s=9cac8b76c0838f2df4f19ebc41c1ae0a" bad_source = "https://skeb.imgix.net/requests/229088_2?bg=%23fff&auto=format&w=800&s=9cac8b76c0838f2df4f19ebc41c1ae0a"