sources: factor out Source::URL::Pixiv.

* Drop support for preview_urls. This means that IQDB lookups may be
  slower, especially for ugoiras, since we have to download the full
  ugoira now. However, ugoira lookups should produce better results,
  since the ugoira thumbnail chosen by Pixiv wasn't necessarily the same
  as the thumbnail chosen by Danbooru.

* Drop support for uploading single manga pages:

    http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=2

  Previously uploading an URL like this would only upload a single image
  out of a multi-image work. Now it will upload all images in the work.
  Pixiv no longer supports URLs like this, so we don't either.

* Add support for parsing URLs like this:

    https://i.pximg.net/c/360x360_70/custom-thumb/img/2022/03/08/00/00/56/96755248_p0_custom1200.jpg

  Apparently artists can choose a custom thumbnail now (not like anyone
  will try to upload one though).
This commit is contained in:
evazion
2022-03-08 22:06:24 -06:00
parent df0bb70486
commit 1c620f8055
3 changed files with 155 additions and 176 deletions

View File

@@ -0,0 +1,140 @@
# frozen_string_literal: true
module Source
class URL::Pixiv < Source::URL
attr_reader :work_id, :page, :username, :user_id, :full_image_url
def self.match?(url)
return false if Source::URL::Fanbox.match?(url) || Source::URL::PixivSketch.match?(url)
url.domain.in?(%w[pximg.net pixiv.net pixiv.me]) || url.host == "tc-pximg01.techorus-cdn.com"
end
def parse
case [host, *path_segments]
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
# https://i.pximg.net/c/250x250_80_a2/img-master/img/2014/10/29/09/27/19/46785915_p0_square1200.jpg
# https://i.pximg.net/img-zip-ugoira/img/2016/04/09/14/25/29/56268141_ugoira1920x1080.zip
# https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
# https://i.pximg.net/c/360x360_70/custom-thumb/img/2022/03/08/00/00/56/96755248_p0_custom1200.jpg
# https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
# https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
#
# but not:
#
# https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
in *, ("img-original" | "img-master" | "img-zip-ugoira" | "img-inf" | "custom-thumb"), "img", year, month, day, hour, min, sec, file if image_url?
parse_filename
if work_id.present? && is_ugoira?
@full_image_url = "https://i.pximg.net/img-zip-ugoira/img/#{year}/#{month}/#{day}/#{hour}/#{min}/#{sec}/#{work_id}_ugoira1920x1080.zip"
elsif work_id.present? && page.present? && file_ext.present?
@full_image_url = "https://i.pximg.net/img-original/img/#{year}/#{month}/#{day}/#{hour}/#{min}/#{sec}/#{work_id}_p#{page}.#{file_ext}"
end
# http://img18.pixiv.net/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720.png
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
in *, "img", username, file if image_url?
parse_filename
@username = username
@full_image_url = url # XXX these URLs all return 404, so we can't convert them to a working full image URL.
# https://www.pixiv.net/en/artworks/46324488
# https://www.pixiv.net/artworks/46324488
in "www.pixiv.net", *, "artworks", work_id
@work_id = work_id
# http://www.pixiv.net/i/18557054
in "www.pixiv.net", "i", work_id
@work_id = work_id
# http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
in "www.pixiv.net", "member_illust.php" if params[:illust_id].present?
@work_id = params[:illust_id]
# https://www.pixiv.net/member.php?id=339253
in "www.pixiv.net", "member.php" if params[:id].present?
@user_id = params[:id]
# https://www.pixiv.net/u/9202877
# https://www.pixiv.net/users/9202877
# https://www.pixiv.net/en/users/9202877
in "www.pixiv.net", *, ("u" | "users"), user_id
@user_id = user_id
# https://www.pixiv.net/stacc/noizave
in "www.pixiv.net", "stacc", username
@username = username
# http://www.pixiv.me/noizave
in "www.pixiv.me", username
@username = username
else
end
end
def parse_filename
case filename.split("_")
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
in /^\d+$/ => work_id, /^p\d+$/ => page, *rest
@work_id = work_id
@page = page.delete_prefix("p")
# https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
# https://i.pximg.net/img-zip-ugoira/img/2016/04/09/14/25/29/56268141_ugoira1920x1080.zip
in /^\d+$/ => work_id, /^ugoira/
@work_id = work_id
@ugoira = true
# https://i.pximg.net/c/240x240/img-master/img/2017/04/04/08/57/38/62247364_master1200.jpg
# http://i1.pixiv.net/img53/img/themare/39735353_big_p1.jpg
# http://i2.pixiv.net/img18/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
# http://i2.pixiv.net/img18/img/evazion/14901720_s.png
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
in /^\d+$/ => work_id, *rest
@work_id = work_id
else
end
end
def image_url?
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
# https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
# https://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
# http://img18.pixiv.net/img/evazion/14901720.png
host.in?(["i.pximg.net", "i-f.pximg.net", "tc-pximg01.techorus-cdn.com"]) || host.match?(/\A(i\d+|img\d+)\.pixiv\.net\z/)
end
def is_ugoira?
@ugoira.present?
end
def page_url
"https://www.pixiv.net/artworks/#{work_id}" if work_id.present?
end
def profile_url
"https://www.pixiv.net/users/#{user_id}" if user_id.present?
end
def stacc_url
"https://www.pixiv.net/stacc/#{username}" if username.present?
end
end
end

View File

@@ -1,58 +1,9 @@
# frozen_string_literal: true
# Pixiv
#
# * https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# * https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
#
# * https://i.pximg.net/c/250x250_80_a2/img-master/img/2014/10/29/09/27/19/46785915_p0_square1200.jpg
# * https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
#
# * https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
#
# * https://www.pixiv.net/member_illust.php?mode=medium&illust_id=46324488
# * https://www.pixiv.net/member_illust.php?mode=manga&illust_id=46324488
# * https://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46324488&page=0
# * https://www.pixiv.net/en/artworks/46324488
#
# * https://www.pixiv.net/member.php?id=339253
# * https://www.pixiv.net/member_illust.php?id=339253&type=illust
# * https://www.pixiv.net/u/9202877
# * https://www.pixiv.net/stacc/noizave
# * http://www.pixiv.me/noizave
#
# Novels
#
# * https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
# * https://i.pximg.net/c/600x600/novel-cover-master/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42_master1200.jpg
# * https://img-novel.pximg.net/img-novel/work_main/XtFbt7gsymsvyaG45lZ8/1554.jpg?20190107110435
#
# * https://www.pixiv.net/novel/show.php?id=10617324
# * https://novel.pixiv.net/works/1554
#
# Sketch
#
# * https://img-sketch.pixiv.net/uploads/medium/file/4463372/8906921629213362989.jpg
# * https://img-sketch.pximg.net/c!/w=540,f=webp:jpeg/uploads/medium/file/4463372/8906921629213362989.jpg
# * https://sketch.pixiv.net/items/1588346448904706151
# * https://sketch.pixiv.net/@0125840
#
# @see Source::URL::Pixiv
module Sources
module Strategies
class Pixiv < Base
MONIKER = /(?:[a-zA-Z0-9_-]+)/
PROFILE = %r{\Ahttps?://www\.pixiv\.net/member\.php\?id=[0-9]+\z}
DATE = %r{(?<date>\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2})}i
EXT = /(?:jpg|jpeg|png|gif)/i
WEB = %r{(?:\A(?:https?://)?www\.pixiv\.net)}
I12 = %r{(?:\A(?:https?://)?i[0-9]+\.pixiv\.net)}
IMG = %r{(?:\A(?:https?://)?img[0-9]*\.pixiv\.net)}
PXIMG = %r{(?:\A(?:https?://)?[^.]+\.pximg\.net)}
UGOIRA = %r{#{PXIMG}/img-zip-ugoira/img/#{DATE}/(?<illust_id>\d+)_ugoira1920x1080\.zip\z}i
ORIG_IMAGE = %r{#{PXIMG}/img-original/img/#{DATE}/(?<illust_id>\d+)_p(?<page>\d+)\.#{EXT}\z}i
def self.enabled?
Danbooru.config.pixiv_phpsessid.present?
end
@@ -81,48 +32,26 @@ module Sources
end
end
def domains
["pixiv.net", "pximg.net"]
end
def match?
return false if parsed_url.nil?
return false if url.include? "/fanbox/"
return false if Source::URL::PixivSketch === parsed_url
parsed_url.domain.in?(domains) || parsed_url.host == "tc-pximg01.techorus-cdn.com"
Source::URL::Pixiv === parsed_url
end
def site_name
"Pixiv"
parsed_url.site_name
end
def image_urls
if is_ugoira?
if parsed_url.image_url?
[parsed_url.full_image_url]
elsif is_ugoira?
[api_ugoira[:originalSrc]]
elsif manga_page.present? && original_urls.present?
[original_urls[manga_page]]
elsif original_urls.present?
original_urls
else
[url]
original_urls
end
end
def original_urls
api_pages.map { |page| page.dig("urls", "original") }
end
def preview_urls
image_urls.map do |url|
case url
when ORIG_IMAGE
"https://i.pximg.net/c/240x240/img-master/img/#{$~[:date]}/#{$~[:illust_id]}_p#{$~[:page]}_master1200.jpg"
when UGOIRA
"https://i.pximg.net/c/240x240/img-master/img/#{$~[:date]}/#{$~[:illust_id]}_master1200.jpg"
else
url
end
end
api_pages.pluck("urls").pluck("original").to_a
end
def page_url
@@ -135,10 +64,8 @@ module Sources
end
def profile_url
url = urls.find { |url| url.match?(PROFILE) }
if url.present?
url
if parsed_url.profile_url.present?
parsed_url.profile_url
elsif api_illust[:userId].present?
"https://www.pixiv.net/users/#{api_illust[:userId]}"
else
@@ -195,7 +122,7 @@ module Sources
def download_file!(url = image_url)
file = super(url)
file.frame_data = ugoira_frame_data if is_ugoira?
file.frame_data = ugoira_frame_data
file
end
@@ -214,58 +141,11 @@ module Sources
end
def is_ugoira?
# https://i.pximg.net/img-original/img/2019/05/27/17/59/33/74932152_ugoira0.jpg
url.match?(UGOIRA) || original_urls&.any?(/ugoira/)
original_urls.any? { |url| Source::URL.parse(url).is_ugoira? }
end
def illust_id
parsed_urls.each do |url|
# http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
if url.host == "www.pixiv.net" && url.path == "/member_illust.php" && url.params.has_key?("illust_id")
return url.params[:illust_id].to_i
# http://www.pixiv.net/en/artworks/46324488
elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/(?:en/)?artworks/(?<illust_id>\d+)}i
return $~[:illust_id].to_i
# http://www.pixiv.net/i/18557054
elsif url.host == "www.pixiv.net" && url.path =~ %r{\A/i/(?<illust_id>\d+)\z}i
return $~[:illust_id].to_i
# http://img18.pixiv.net/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720.png
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
# http://i2.pixiv.net/img18/img/evazion/14901720_s.png
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
# http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png
elsif url.host =~ /\A(?:i\d+|img\d+)\.pixiv\.net\z/i &&
url.path =~ %r{\A(?:/img\d+)?/img/#{MONIKER}/(?<illust_id>\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)}i
return $~[:illust_id].to_i
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg
# http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png
# http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip
# https://i.pximg.net/img-original/img/2014/10/03/18/10/20/46324488_p0.png
# https://i.pximg.net/img-master/img/2014/10/03/18/10/20/46324488_p0_master1200.jpg
# https://i-f.pximg.net/img-original/img/2020/02/19/00/40/18/79584713_p0.png
# https://tc-pximg01.techorus-cdn.com/img-original/img/2017/09/18/03/18/24/65015428_p4.png
#
# but not:
#
# https://i.pximg.net/novel-cover-original/img/2019/01/14/01/15/05/10617324_d84daae89092d96bbe66efafec136e42.jpg
# https://img-sketch.pixiv.net/uploads/medium/file/4463372/8906921629213362989.jpg
elsif url.host =~ /\A(?:[^.]+\.pximg\.net|i\d+\.pixiv\.net|tc-pximg01\.techorus-cdn\.com)\z/i &&
url.path =~ %r{\A(/c/\w+)?/img-[a-z-]+/img/#{DATE}/(?<illust_id>\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)}i
return $~[:illust_id].to_i
end
end
nil
parsed_url.work_id || parsed_referer&.work_id
end
def api_client
@@ -285,16 +165,7 @@ module Sources
end
def moniker
# we can sometimes get the moniker from the url
if url =~ %r{#{IMG}/img/(#{MONIKER})}i
$1
elsif url =~ %r{#{I12}/img[0-9]+/img/(#{MONIKER})}i
$1
elsif url =~ %r{#{WEB}/stacc/(#{MONIKER})/?$}i
$1
else
api_illust[:userAccount]
end
parsed_url.username || api_illust[:userAccount]
end
def ugoira_frame_data
@@ -302,30 +173,6 @@ module Sources
api_ugoira[:frames]
end
def ugoira_content_type
api_ugoira[:mime_type]
end
# Returns the current page number of the manga. This will not
# make any api calls and only looks at (url, referer_url).
def manga_page
# http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
# http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
# http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg
if url =~ %r{/\d+_p(\d+)(?:_\w+)?\.#{EXT}}i
return $1.to_i
end
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0
[url, referer_url].each do |x|
if x =~ /page=(\d+)/i
return $1.to_i
end
end
nil
end
memoize :illust_id, :api_client, :api_illust, :api_pages, :api_ugoira
end
end

View File

@@ -8,7 +8,7 @@ module Sources
def assert_illust_id(illust_id, url)
site = Sources::Strategies.find(url)
assert_equal(illust_id, site.illust_id)
assert_equal(illust_id, site.illust_id.to_i)
assert_nothing_raised { site.to_h }
end
@@ -43,10 +43,6 @@ module Sources
assert_equal("https://i.pximg.net/img-zip-ugoira/img/2017/04/04/08/57/38/62247364_ugoira1920x1080.zip", @site.file_url)
end
should "get the preview url" do
assert_equal("https://i.pximg.net/c/240x240/img-master/img/2017/04/04/08/57/38/62247364_master1200.jpg", @site.preview_url)
end
should "capture the frame data" do
media_file = @site.download_file!
@@ -103,10 +99,6 @@ module Sources
assert_equal("https://i.pximg.net/img-original/img/2017/11/21/05/12/37/65981735_p0.jpg", @site.image_url)
end
should "get the preview size image url" do
assert_equal("https://i.pximg.net/c/240x240/img-master/img/2017/11/21/05/12/37/65981735_p0_master1200.jpg", @site.preview_url)
end
should "get the page count" do
assert_equal(1, @site.image_urls.size)
end