sources: factor out Source::URL::Pixiv.

* Drop support for preview_urls. This means that IQDB lookups may be
  slower, especially for ugoiras, since we have to download the full
  ugoira now. However, ugoira lookups should produce better results,
  since the ugoira thumbnail chosen by Pixiv wasn't necessarily the same
  as the thumbnail chosen by Danbooru.

* Drop support for uploading single manga pages:

    http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=2

  Previously, uploading a URL like this would only upload a single image
  out of a multi-image work. Now it will upload all images in the work.
  Pixiv no longer supports URLs like this, so we don't either.

* Add support for parsing URLs like this:

    https://i.pximg.net/c/360x360_70/custom-thumb/img/2022/03/08/00/00/56/96755248_p0_custom1200.jpg

  Apparently artists can choose a custom thumbnail now (not like anyone
  will try to upload one though).
This commit is contained in:
evazion
2022-03-08 22:06:24 -06:00
parent df0bb70486
commit 1c620f8055
3 changed files with 155 additions and 176 deletions

View File

@@ -0,0 +1,140 @@
# frozen_string_literal: true
module Source
class URL::Pixiv < Source::URL
attr_reader :work_id, :page, :username, :user_id, :full_image_url
# Decide whether a URL should be handled by this class.
#
# URLs claimed by Source::URL::Fanbox or Source::URL::PixivSketch are
# rejected first; after that the URL matches when its domain is one of
# pximg.net, pixiv.net or pixiv.me, or its host is the techorus CDN host.
#
# @param url [#domain, #host] the URL object to test
# @return [Boolean]
def self.match?(url)
  handled_elsewhere = Source::URL::Fanbox.match?(url) || Source::URL::PixivSketch.match?(url)
  return false if handled_elsewhere

  %w[pximg.net pixiv.net pixiv.me].include?(url.domain) || url.host == "tc-pximg01.techorus-cdn.com"
end
# Recognize the URL's shape and record what can be read from it.
#
# The host and path segments are matched, in order, against the known Pixiv
# URL layouts. Depending on the layout this sets @work_id, @user_id,
# @username and/or @full_image_url (image layouts also delegate to
# #parse_filename). Unrecognized URLs leave everything unset.
def parse
  case [host, *path_segments]

  # Dated image URLs:
  #
  # https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
  # https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
  # https://i.pximg.net/c/250x250_80_a2/img-master/img/2014/10/29/09/27/19/46785915_p0_square1200.jpg
  # https://i.pximg.net/img-zip-ugoira/img/2016/04/09/14/25/29/56268141_ugoira1920x1080.zip
  # https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
  # https://i.pximg.net/c/360x360_70/custom-thumb/img/2022/03/08/00/00/56/96755248_p0_custom1200.jpg
  # https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
  # http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
  # http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
  # https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
  #
  # but not:
  #
  # https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
  in *, ("img-original" | "img-master" | "img-zip-ugoira" | "img-inf" | "custom-thumb"), "img", year, month, day, hour, min, sec, _file if image_url?
    parse_filename
    timestamp = [year, month, day, hour, min, sec].join("/")

    # A full-size URL can only be rebuilt once the filename yielded a work ID.
    if work_id.present?
      if is_ugoira?
        @full_image_url = "https://i.pximg.net/img-zip-ugoira/img/#{timestamp}/#{work_id}_ugoira1920x1080.zip"
      elsif page.present? && file_ext.present?
        @full_image_url = "https://i.pximg.net/img-original/img/#{timestamp}/#{work_id}_p#{page}.#{file_ext}"
      end
    end

  # Old-style image URLs that carry the artist's username:
  #
  # http://img18.pixiv.net/img/evazion/14901720.png
  # http://i2.pixiv.net/img18/img/evazion/14901720.png
  # http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
  in *, "img", account, _file if image_url?
    parse_filename
    @username = account
    # XXX these URLs all return 404, so we can't convert them to a working full image URL.
    @full_image_url = url

  # https://www.pixiv.net/en/artworks/46324488
  # https://www.pixiv.net/artworks/46324488
  in "www.pixiv.net", *, "artworks", artwork_id
    @work_id = artwork_id

  # http://www.pixiv.net/i/18557054
  in "www.pixiv.net", "i", artwork_id
    @work_id = artwork_id

  # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
  # http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
  # http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
  # http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
  in "www.pixiv.net", "member_illust.php" if params[:illust_id].present?
    @work_id = params[:illust_id]

  # https://www.pixiv.net/member.php?id=339253
  in "www.pixiv.net", "member.php" if params[:id].present?
    @user_id = params[:id]

  # https://www.pixiv.net/u/9202877
  # https://www.pixiv.net/users/9202877
  # https://www.pixiv.net/en/users/9202877
  in "www.pixiv.net", *, ("u" | "users"), member_id
    @user_id = member_id

  # https://www.pixiv.net/stacc/noizave
  in "www.pixiv.net", "stacc", account
    @username = account

  # http://www.pixiv.me/noizave
  in "www.pixiv.me", account
    @username = account

  else
    # Unrecognized layout: nothing to record.
  end
end
def parse_filename
case filename.split("_")
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
in /^\d+$/ => work_id, /^p\d+$/ => page, *rest
@work_id = work_id
@page = page.delete_prefix("p")
# https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
# https://i.pximg.net/img-zip-ugoira/img/2016/04/09/14/25/29/56268141_ugoira1920x1080.zip
in /^\d+$/ => work_id, /^ugoira/
@work_id = work_id
@ugoira = true
# https://i.pximg.net/c/240x240/img-master/img/2017/04/04/08/57/38/62247364_master1200.jpg
# http://i1.pixiv.net/img53/img/themare/39735353_big_p1.jpg
# http://i2.pixiv.net/img18/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
# http://i2.pixiv.net/img18/img/evazion/14901720_s.png
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
in /^\d+$/ => work_id, *rest
@work_id = work_id
else
end
end
# Whether the URL's host is one of the hosts Pixiv serves image files from.
#
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
# https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
# https://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
# http://img18.pixiv.net/img/evazion/14901720.png
#
# @return [Boolean]
def image_url?
  case host
  when "i.pximg.net", "i-f.pximg.net", "tc-pximg01.techorus-cdn.com"
    true
  else
    # Legacy numbered hosts such as i2.pixiv.net or img18.pixiv.net.
    host.match?(/\A(i\d+|img\d+)\.pixiv\.net\z/)
  end
end
# Whether #parse_filename flagged this URL's filename as belonging to a ugoira.
#
# @return [Boolean]
def is_ugoira?
  !@ugoira.blank?
end
# The artwork page for this URL's work.
#
# @return [String, nil] nil when no work ID is known
def page_url
  return unless work_id.present?

  "https://www.pixiv.net/artworks/#{work_id}"
end
# The profile page of the user this URL identifies.
#
# @return [String, nil] nil when no user ID is known
def profile_url
  return unless user_id.present?

  "https://www.pixiv.net/users/#{user_id}"
end
# The stacc page of the user this URL names.
#
# @return [String, nil] nil when no username is known
def stacc_url
  return unless username.present?

  "https://www.pixiv.net/stacc/#{username}"
end
end
end

View File

@@ -1,58 +1,9 @@
# frozen_string_literal: true # frozen_string_literal: true
# Pixiv # @see Source::URL::Pixiv
#
# * https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# * https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
#
# * https://i.pximg.net/c/250x250_80_a2/img-master/img/2014/10/29/09/27/19/46785915_p0_square1200.jpg
# * https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
#
# * https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
#
# * https://www.pixiv.net/member_illust.php?mode=medium&illust_id=46324488
# * https://www.pixiv.net/member_illust.php?mode=manga&illust_id=46324488
# * https://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46324488&page=0
# * https://www.pixiv.net/en/artworks/46324488
#
# * https://www.pixiv.net/member.php?id=339253
# * https://www.pixiv.net/member_illust.php?id=339253&type=illust
# * https://www.pixiv.net/u/9202877
# * https://www.pixiv.net/stacc/noizave
# * http://www.pixiv.me/noizave
#
# Novels
#
# * https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
# * https://i.pximg.net/c/600x600/novel-cover-master/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42_master1200.jpg
# * https://img-novel.pximg.net/img-novel/work_main/XtFbt7gsymsvyaG45lZ8/1554.jpg?20190107110435
#
# * https://www.pixiv.net/novel/show.php?id=10617324
# * https://novel.pixiv.net/works/1554
#
# Sketch
#
# * https://img-sketch.pixiv.net/uploads/medium/file/4463372/8906921629213362989.jpg
# * https://img-sketch.pximg.net/c!/w=540,f=webp:jpeg/uploads/medium/file/4463372/8906921629213362989.jpg
# * https://sketch.pixiv.net/items/1588346448904706151
# * https://sketch.pixiv.net/@0125840
#
module Sources module Sources
module Strategies module Strategies
class Pixiv < Base class Pixiv < Base
MONIKER = /(?:[a-zA-Z0-9_-]+)/
PROFILE = %r{\Ahttps?://www\.pixiv\.net/member\.php\?id=[0-9]+\z}
DATE = %r{(?<date>\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})}i
EXT = /(?:jpg|jpeg|png|gif)/i
WEB = %r{(?:\A(?:https?://)?www\.pixiv\.net)}
I12 = %r{(?:\A(?:https?://)?i[0-9]+\.pixiv\.net)}
IMG = %r{(?:\A(?:https?://)?img[0-9]*\.pixiv\.net)}
PXIMG = %r{(?:\A(?:https?://)?[^.]+\.pximg\.net)}
UGOIRA = %r{#{PXIMG}/img-zip-ugoira/img/#{DATE}/(?<illust_id>\d+)_ugoira1920x1080\.zip\z}i
ORIG_IMAGE = %r{#{PXIMG}/img-original/img/#{DATE}/(?<illust_id>\d+)_p(?<page>\d+)\.#{EXT}\z}i
def self.enabled? def self.enabled?
Danbooru.config.pixiv_phpsessid.present? Danbooru.config.pixiv_phpsessid.present?
end end
@@ -81,48 +32,26 @@ module Sources
end end
end end
def domains
["pixiv.net", "pximg.net"]
end
def match? def match?
return false if parsed_url.nil? Source::URL::Pixiv === parsed_url
return false if url.include? "/fanbox/"
return false if Source::URL::PixivSketch === parsed_url
parsed_url.domain.in?(domains) || parsed_url.host == "tc-pximg01.techorus-cdn.com"
end end
def site_name def site_name
"Pixiv" parsed_url.site_name
end end
def image_urls def image_urls
if is_ugoira? if parsed_url.image_url?
[parsed_url.full_image_url]
elsif is_ugoira?
[api_ugoira[:originalSrc]] [api_ugoira[:originalSrc]]
elsif manga_page.present? && original_urls.present?
[original_urls[manga_page]]
elsif original_urls.present?
original_urls
else else
[url] original_urls
end end
end end
def original_urls def original_urls
api_pages.map { |page| page.dig("urls", "original") } api_pages.pluck("urls").pluck("original").to_a
end
def preview_urls
image_urls.map do |url|
case url
when ORIG_IMAGE
"https://i.pximg.net/c/240x240/img-master/img/#{$~[:date]}/#{$~[:illust_id]}_p#{$~[:page]}_master1200.jpg"
when UGOIRA
"https://i.pximg.net/c/240x240/img-master/img/#{$~[:date]}/#{$~[:illust_id]}_master1200.jpg"
else
url
end
end
end end
def page_url def page_url
@@ -135,10 +64,8 @@ module Sources
end end
def profile_url def profile_url
url = urls.find { |url| url.match?(PROFILE) } if parsed_url.profile_url.present?
parsed_url.profile_url
if url.present?
url
elsif api_illust[:userId].present? elsif api_illust[:userId].present?
"https://www.pixiv.net/users/#{api_illust[:userId]}" "https://www.pixiv.net/users/#{api_illust[:userId]}"
else else
@@ -195,7 +122,7 @@ module Sources
def download_file!(url = image_url) def download_file!(url = image_url)
file = super(url) file = super(url)
file.frame_data = ugoira_frame_data if is_ugoira? file.frame_data = ugoira_frame_data
file file
end end
@@ -214,58 +141,11 @@ module Sources
end end
def is_ugoira? def is_ugoira?
# https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg original_urls.any? { |url| Source::URL.parse(url).is_ugoira? }
url.match?(UGOIRA) || original_urls&.any?(/ugoira/)
end end
def illust_id def illust_id
parsed_urls.each do |url| parsed_url.work_id || parsed_referer&.work_id
# http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.params.has_key?("illust_id")
return url.params[:illust_id].to_i
# http://www.pixiv.net/en/artworks/46324488
elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/(?:en/)?artworks/(?<illust_id>\d+)}i
return $~[:illust_id].to_i
# http://www.pixiv.net/i/18557054
elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/i/(?<illust_id>\d+)\z}i
return $~[:illust_id].to_i
# http://img18.pixiv.net/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
# http://i2.pixiv.net/img18/img/evazion/14901720_s.png
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
# http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png
elsif url.host =~ /\A(?:i\d+|img\d+)\.pixiv\.net\z/i &&
url.path =~ %r{\A(?:/img\d+)?/img/#{MONIKER}/(?<illust_id>\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)}i
return $~[:illust_id].to_i
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg
# http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png
# http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
# https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
# https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
#
# but not:
#
# https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
# https://img-sketch.pixiv.net/uploads/medium/file/4463372/8906921629213362989.jpg
elsif url.host =~ /\A(?:[^.]+\.pximg\.net|i\d+\.pixiv\.net|tc-pximg01\.techorus-cdn\.com)\z/i &&
url.path =~ %r{\A(/c/\w+)?/img-[a-z-]+/img/#{DATE}/(?<illust_id>\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)}i
return $~[:illust_id].to_i
end
end
nil
end end
def api_client def api_client
@@ -285,16 +165,7 @@ module Sources
end end
def moniker def moniker
# we can sometimes get the moniker from the url parsed_url.username || api_illust[:userAccount]
if url =~ %r{#{IMG}/img/(#{MONIKER})}i
$1
elsif url =~ %r{#{I12}/img[0-9]+/img/(#{MONIKER})}i
$1
elsif url =~ %r{#{WEB}/stacc/(#{MONIKER})/?$}i
$1
else
api_illust[:userAccount]
end
end end
def ugoira_frame_data def ugoira_frame_data
@@ -302,30 +173,6 @@ module Sources
api_ugoira[:frames] api_ugoira[:frames]
end end
def ugoira_content_type
api_ugoira[:mime_type]
end
# Returns the current page number of the manga. This will not
# make any api calls and only looks at (url, referer_url).
def manga_page
# http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
# http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
# http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg
if url =~ %r{/\d+_p(\d+)(?:_\w+)?\.#{EXT}}i
return $1.to_i
end
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0
[url, referer_url].each do |x|
if x =~ /page=(\d+)/i
return $1.to_i
end
end
nil
end
memoize :illust_id, :api_client, :api_illust, :api_pages, :api_ugoira memoize :illust_id, :api_client, :api_illust, :api_pages, :api_ugoira
end end
end end

View File

@@ -8,7 +8,7 @@ module Sources
def assert_illust_id(illust_id, url) def assert_illust_id(illust_id, url)
site = Sources::Strategies.find(url) site = Sources::Strategies.find(url)
assert_equal(illust_id, site.illust_id) assert_equal(illust_id, site.illust_id.to_i)
assert_nothing_raised { site.to_h } assert_nothing_raised { site.to_h }
end end
@@ -43,10 +43,6 @@ module Sources
assert_equal("https://i.pximg.net/img-zip-ugoira/img/2017/04/04/08/57/38/62247364_ugoira1920x1080.zip", @site.file_url) assert_equal("https://i.pximg.net/img-zip-ugoira/img/2017/04/04/08/57/38/62247364_ugoira1920x1080.zip", @site.file_url)
end end
should "get the preview url" do
assert_equal("https://i.pximg.net/c/240x240/img-master/img/2017/04/04/08/57/38/62247364_master1200.jpg", @site.preview_url)
end
should "capture the frame data" do should "capture the frame data" do
media_file = @site.download_file! media_file = @site.download_file!
@@ -103,10 +99,6 @@ module Sources
assert_equal("https://i.pximg.net/img-original/img/2017/11/21/05/12/37/65981735_p0.jpg", @site.image_url) assert_equal("https://i.pximg.net/img-original/img/2017/11/21/05/12/37/65981735_p0.jpg", @site.image_url)
end end
should "get the preview size image url" do
assert_equal("https://i.pximg.net/c/240x240/img-master/img/2017/11/21/05/12/37/65981735_p0_master1200.jpg", @site.preview_url)
end
should "get the page count" do should "get the page count" do
assert_equal(1, @site.image_urls.size) assert_equal(1, @site.image_urls.size)
end end