sources: factor out Source::URL::Skeb.

This commit is contained in:
evazion
2022-02-24 19:53:40 -06:00
parent 6523cd14cb
commit 26f4cf1ebd
7 changed files with 155 additions and 58 deletions

View File

@@ -43,8 +43,13 @@ module Danbooru
nil
end
# @return [String] the URL in normalized form
# @return [String] the URL in unnormalized form
def to_s
original_url
end
# @return [String] the URL in normalized form
def to_normalized_s
url.to_str
end

View File

@@ -19,6 +19,7 @@ module Source
class URL < Danbooru::URL
SUBCLASSES = [
Source::URL::Twitter,
Source::URL::Skeb,
Source::URL::TwitPic,
Source::URL::Foundation,
]

View File

@@ -0,0 +1,97 @@
# frozen_string_literal: true
# Image URLs
#
## Non-watermarked:
#
# # Page: https://skeb.jp/@OrvMZ/works/3
# * https://skeb.imgix.net/requests/199886_0?bg=%23fff&auto=format&w=800&s=5a6a908ab964fcdfc4713fad179fe715
#
## Watermarked:
#
# * https://skeb.imgix.net/requests/73290_0?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=4843435cff85d623b1f657209d131526
# * https://skeb.imgix.net/uploads/origins/04d62c2f-e396-46f9-903a-3ca8bd69fc7c?bg=%23fff&auto=format&w=800&s=966c5d0389c3b94dc36ac970f812bef4 (new format)
#
## Full Size (found in commissioner_upload):
#
# # Page: https://skeb.jp/@matsuda_Toki/works/101
# * https://skeb.imgix.net/requests/53269_1?bg=%23fff&fm=png&dl=53269.png&w=1.0&h=1.0&s=44588ea9c41881049e392adb1df21cce
#
# The signature is required and tied to the parameters. Doesn't seem like it's possible to reverse engineer it to remove the watermark, unfortunately.
#
# Video URLs
#
# # Page: https://skeb.jp/@kaisouafuro/works/112
# * https://skeb-production.s3.ap-northeast-1.amazonaws.com/uploads/outputs/20f9d68f-50ec-44ae-8630-173fc38a2d6a?response-content-disposition=attachment%3B%20filename%3D%22458093-1.output.mp4%22%3B%20filename%2A%3DUTF-8%27%27458093-1.output.mp4&response-content-type=video%2Fmp4&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIVPUTFQBBL7UDSUA%2F20220221%2Fap-northeast-1%2Fs3%2Faws4_request&X-Amz-Date=20220221T200057Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=7f028cfd9a56344cf1d42410063fad3ef30a1e47b83cef047247e0c37df01df0
#
# Page URLs
#
# * https://skeb.jp/@OrvMZ/works/3 (non-watermarked)
# * https://skeb.jp/@OrvMZ/works/1 (separated request and client's message after delivery. We can't get the latter)
# * https://skeb.jp/@asanagi/works/16 (age-restricted, watermarked)
# * https://skeb.jp/@asanagi/works/6 (private, returns 404)
# * https://skeb.jp/@nasuno42/works/30 (multi-image post)
#
# Profile URLs
#
# Since skeb forces login through twitter, usernames are the same as twitter
#
# * https://skeb.jp/@asanagi
#
# API URLs
#
## Must send "Authorization: Bearer null"
#
# * https://skeb.jp/api/users/kaisouafuro
# * https://skeb.jp/api/users/kaisouafuro/works/112
#
class Source::URL::Skeb < Source::URL
attr_reader :username, :work_id, :image_id, :image_uuid
def self.match?(url)
url.host.in?(%w[skeb.jp skeb.imgix.net skeb-production.s3.ap-northeast-1.amazonaws.com])
end
def parse
case [domain, *path_segments]
# https://skeb.jp/@asanagi
in "skeb.jp", /^@/ => username
@username = username.delete_prefix("@")
# https://skeb.jp/@OrvMZ/works/3
in "skeb.jp", /^@/ => username, "works", work_id
@username = username.delete_prefix("@")
@work_id = work_id
# https://skeb.imgix.net/requests/199886_0?bg=%23fff&auto=format&w=800&s=5a6a908ab964fcdfc4713fad179fe715
# https://skeb.imgix.net/requests/73290_0?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=4843435cff85d623b1f657209d131526
# https://skeb.imgix.net/requests/53269_1?bg=%23fff&fm=png&dl=53269.png&w=1.0&h=1.0&s=44588ea9c41881049e392adb1df21cce
in "imgix.net", "requests", image_id
@image_id = image_id
# https://skeb.imgix.net/uploads/origins/04d62c2f-e396-46f9-903a-3ca8bd69fc7c?bg=%23fff&auto=format&w=800&s=966c5d0389c3b94dc36ac970f812bef4
in "imgix.net", "uploads", "origins", image_uuid
@image_uuid = image_uuid
# Page: https://skeb.jp/@kaisouafuro/works/112
# https://skeb-production.s3.ap-northeast-1.amazonaws.com/uploads/outputs/20f9d68f-50ec-44ae-8630-173fc38a2d6a?response-content-disposition=attachment%3B%20filename%3D%22458093-1.output.mp4%22%3B%20filename%2A%3DUTF-8%27%27458093-1.output.mp4&response-content-type=video%2Fmp4&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIVPUTFQBBL7UDSUA%2F20220221%2Fap-northeast-1%2Fs3%2Faws4_request&X-Amz-Date=20220221T200057Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=7f028cfd9a56344cf1d42410063fad3ef30a1e47b83cef047247e0c37df01df0
in "amazonaws.com", "uploads", "outputs", image_uuid
@image_uuid = image_uuid
else
end
end
def image_url?
domain.in?(%[imgix.net amazonaws.com])
end
def animated?
image_url? && params[:fm].in?(["gif", "mp4"])
end
def watermarked?
image_url? && params[:txt].present?
end
end

View File

@@ -1,72 +1,54 @@
# frozen_string_literal: true
# Image URLS
## Non-watermarked:
# * https://skeb.imgix.net/requests/199886_0?bg=%23fff&auto=format&w=800&s=5a6a908ab964fcdfc4713fad179fe715
## Watermarked:
# * https://skeb.imgix.net/requests/73290_0?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&w=800&s=4843435cff85d623b1f657209d131526
# * https://skeb.imgix.net/uploads/origins/04d62c2f-e396-46f9-903a-3ca8bd69fc7c?bg=%23fff&auto=format&w=800&s=966c5d0389c3b94dc36ac970f812bef4 (new format)
## Full Size (found in commissioner_upload):
# * https://skeb.imgix.net/requests/53269_1?bg=%23fff&fm=png&dl=53269.png&w=1.0&h=1.0&s=44588ea9c41881049e392adb1df21cce
#
# The signature is required and tied to the parameters. Doesn't seem like it's possible to reverse engineer it to remove the watermark, unfortunately.
#
# Page URLS
# * https://skeb.jp/@OrvMZ/works/3 (non-watermarked)
# * https://skeb.jp/@OrvMZ/works/1 (separated request and client's message after delivery. We can't get the latter)
# * https://skeb.jp/@asanagi/works/16 (age-restricted, watermarked)
# * https://skeb.jp/@asanagi/works/6 (private, returns 404)
# * https://skeb.jp/@nasuno42/works/30 (multi-image post)
#
# Profile URLS
# Since skeb forces login through twitter, usernames are the same as twitter
# * https://skeb.jp/@asanagi
# @see Source::URL::Skeb
module Sources
module Strategies
class Skeb < Base
PROFILE_URL = %r{https?://(?:www\.)?skeb\.jp/@(?<artist_name>\w+)}i
PAGE_URL = %r{#{PROFILE_URL}/works/(?<illust_id>\d+)}i
IMAGE_URL = %r{https?://(?:(?:www\.)?skeb\.imgix\.net|skeb-production.s3.ap-northeast-1.amazonaws.com/)/.+}i
UUID_REGEX = %r{/(?<uuid>(?:(?:\w+-)+\w+|(?:\d+_\d+))).*(?:fm=(?<type>\w+))?.*}
def domains
["skeb.jp"]
end
def image_domains
["skeb.imgix.net", "skeb-production.s3.ap-northeast-1.amazonaws.com"]
end
extend Memoist
def match?
return false if parsed_url.nil?
parsed_url.domain.in?(domains) || parsed_url.host.in?(image_domains)
parsed_url&.site_name == "Skeb"
end
def site_name
"Skeb"
parsed_url.site_name
end
def image_urls
if url =~ IMAGE_URL
if parsed_url.image_url?
[url]
elsif api_response.present?
previews = api_response["previews"].to_a.map { |preview| preview&.dig("url") }.compact.uniq
unwatermarked = api_response["article_image_url"]
return previews unless unwatermarked.present?
previews.map do |p|
next p unless p[UUID_REGEX, :uuid].present? && p[UUID_REGEX, :uuid] == unwatermarked[UUID_REGEX, :uuid]
next p if p[/fm=(\w+)/, 1].in?(["gif", "mp4"])
next p unless p.include?("&txt=")
unwatermarked
end
elsif unwatermarked_url.present?
# If the unwatermarked URL is present, then find and replace the watermarked URL
# with the unwatermarked version (unless the watermarked version is a video or
# gif, in which case the unwatermarked URL is not used because it's a still image).
#
# https://skeb.jp/@goma_feet/works/1: https://skeb.imgix.net/uploads/origins/78ca23dc-a053-4ebe-894f-d5a06e228af8?bg=%23fff&auto=format&w=800&s=3de55b04236059113659f99fd6900d7d
# https://skeb.jp/@2gi0gi_/works/13: https://skeb.imgix.net/requests/191942_0?bg=%23fff&fm=jpg&q=45&w=696&s=5783ee951cc55d183713395926389453
# https://skeb.jp/@tontaro_/works/316: https://skeb.imgix.net/uploads/origins/5097b1e1-18ce-418e-82f0-e7e2cdab1cea?bg=%23fff&auto=format&txtfont=bold&txtshad=70&txtclr=BFFFFFFF&txtalign=middle%2Ccenter&txtsize=150&txt=SAMPLE&fm=mp4&w=800&s=fcff06871e114b3dbf505c04f27b5ed1
sample_urls.map do |sample_url|
if sample_url.path == unwatermarked_url.path && sample_url.watermarked? && !sample_url.animated?
unwatermarked_url
else
sample_url
end
end.map(&:to_s)
else
[]
sample_urls.map(&:to_s)
end
end
def sample_urls
api_response["previews"].to_a.pluck("url").compact.map { |url| Source::URL.parse(url) }
end
# Some posts have an unwatermarked version of the image. Usually it's lower
# resolution and lower JPEG quality than the watermarked image. Multi-image posts
# will have only one unwatermarked URL.
def unwatermarked_url
return nil if api_response["article_image_url"].nil?
Source::URL.parse(api_response["article_image_url"])
end
def page_url
return unless artist_name.present? && illust_id.present?
"https://skeb.jp/@#{artist_name}/works/#{illust_id}"
@@ -76,13 +58,19 @@ module Sources
page_url
end
def api_url
return nil unless artist_name.present? && illust_id.present?
"https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"
end
def api_response
return {} unless artist_name.present? && illust_id.present?
return {} unless api_url.present?
headers = {
Referer: profile_url,
Authorization: "Bearer null",
}
api_url = "https://skeb.jp/api/users/#{artist_name}/works/#{illust_id}"
response = http.cache(1.minute).headers(headers).get(api_url)
return {} unless response.status == 200
# The status check is required for private commissions, which return 404
@@ -96,7 +84,7 @@ module Sources
end
def artist_name
urls.map { |u| u[PROFILE_URL, :artist_name] }.compact.first
parsed_url.username || parsed_referer&.username
end
def display_name
@@ -104,7 +92,7 @@ module Sources
end
def illust_id
urls.map { |u| u[PAGE_URL, :illust_id] }.compact.first
parsed_url.work_id || parsed_referer&.work_id
end
def other_names
@@ -127,6 +115,8 @@ module Sources
artist_commentary_desc
end
end
memoize :api_response
end
end
end

View File

@@ -118,7 +118,7 @@ class ArtistURL < ApplicationRecord
end
def self.normalize_url(url)
Danbooru::URL.parse(url)&.to_s.presence || url
Danbooru::URL.parse(url)&.to_normalized_s.presence || url
end
def url=(url)

View File

@@ -70,7 +70,7 @@ class Upload < ApplicationRecord
class_methods do
# percent-encode unicode characters in the URL
def normalize_source(url)
Danbooru::URL.parse(url)&.to_s.presence || url
Danbooru::URL.parse(url)&.to_normalized_s.presence || url
end
end
end

View File

@@ -338,6 +338,10 @@ class UploadsControllerTest < ActionDispatch::IntegrationTest
should_upload_successfully("https://foundation.app/@mochiiimo/~/97376")
should_upload_successfully("https://foundation.app/@mochiiimo/foundation/97376")
should_upload_successfully("https://foundation.app/@KILLERGF/kgfgen/4")
should_upload_successfully("https://skeb.jp/@kokuzou593/works/45")
should_upload_successfully("https://skeb.jp/@LambOic029/works/146")
should_upload_successfully("https://skeb.imgix.net/uploads/origins/307941e9-dbe0-4e4b-93d4-94accdaff9a0?bg=%23fff&auto=format&w=800&s=e0ddfb1fa0d9f23797b338598aae78fa")
end
end
end