Fix random VCR failures in Pixiv tests. Sometimes tests randomly fail because the PHPSESSID they use in their HTTP requests to Pixiv is different than the one that was originally recorded by VCR. This causes VCR to complain that the requests don't match. This is caused by the PHPSESSID being globally cached in Memcache. Depending on the order the tests run in (which is random), one set of tests can use a PHPSESSID that was recorded for a /different/ set of tests. Improve Pixiv URL matching. * Allow URLs that are missing the http:// part. These are sometimes seen in artist entries. * Ignore URLs from random Pixiv domains such as dic.pixiv.net, blog.pixiv.net, etc. These are also sometimes in artist entries. Improve normalize_for_artist_finder! URL matching. * Normalize www.pixiv.net/stacc/username URLs. * Correctly normalize URLs that are missing the illust ID part on the end (i.e. http://i2.pixiv.net/img04/img/syounen_no_uta/). These are common in artist entries. Match URLs strictly when normalizing for artist entries. Only normalize Pixiv URLs that strictly match a known format. Pass any unrecognized URLs through without attempting to normalize them, just to be safe. Normalize URLs when saving artist entries.
436 lines
15 KiB
Ruby
436 lines
15 KiB
Ruby
# encoding: UTF-8
|
|
|
|
require 'csv'
|
|
|
|
module Sources
|
|
class Error < StandardError ; end
|
|
|
|
module Strategies
|
|
class Pixiv < Base
|
|
attr_reader :zip_url, :ugoira_frame_data, :ugoira_content_type
|
|
|
|
MONIKER = '(?:[a-zA-Z0-9_-]+)'
|
|
TIMESTAMP = '(?:[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2})'
|
|
EXT = "(?:jpg|jpeg|png|gif)"
|
|
|
|
WEB = "^(?:https?://)?www\\.pixiv\\.net"
|
|
I12 = "^(?:https?://)?i[12]\\.pixiv\\.net"
|
|
IMG = "^(?:https?://)?img[0-9]*\\.pixiv\\.net"
|
|
|
|
def self.url_match?(url)
|
|
url =~ /#{WEB}|#{IMG}|#{I12}/i
|
|
end
|
|
|
|
def referer_url(template)
|
|
if template.params[:ref] =~ /pixiv\.net\/member_illust/ && template.params[:ref] =~ /mode=medium/
|
|
template.params[:ref]
|
|
else
|
|
template.params[:url]
|
|
end
|
|
end
|
|
|
|
def site_name
|
|
"Pixiv"
|
|
end
|
|
|
|
def unique_id
|
|
@pixiv_moniker
|
|
end
|
|
|
|
def normalized_for_artist_finder?
|
|
url =~ %r!http://img\.pixiv\.net/img/#{MONIKER}/?$!i
|
|
end
|
|
|
|
def normalizable_for_artist_finder?
|
|
has_moniker? || sample_image? || full_image? || work_page?
|
|
end
|
|
|
|
def normalize_for_artist_finder!
|
|
if has_moniker?
|
|
moniker = get_moniker_from_url
|
|
else
|
|
illust_id = illust_id_from_url(url)
|
|
get_metadata_from_spapi!(illust_id) do |metadata|
|
|
moniker = metadata[24]
|
|
end
|
|
end
|
|
|
|
"http://img.pixiv.net/img/#{moniker}/"
|
|
end
|
|
|
|
def get
|
|
agent.get(URI.parse(normalized_url)) do |page|
|
|
@artist_name, @profile_url = get_profile_from_page(page)
|
|
@pixiv_moniker = get_moniker_from_page(page)
|
|
@zip_url, @ugoira_frame_data, @ugoira_content_type = get_zip_url_from_page(page)
|
|
@tags = get_tags_from_page(page)
|
|
@page_count = get_page_count_from_page(page)
|
|
|
|
is_manga = @page_count > 1
|
|
|
|
if !@zip_url
|
|
@image_url = get_image_url_from_page(page, is_manga)
|
|
end
|
|
end
|
|
end
|
|
|
|
def rewrite_thumbnails(thumbnail_url, is_manga=nil)
|
|
thumbnail_url = rewrite_new_medium_images(thumbnail_url)
|
|
thumbnail_url = rewrite_medium_ugoiras(thumbnail_url)
|
|
thumbnail_url = rewrite_old_small_and_medium_images(thumbnail_url, is_manga)
|
|
return thumbnail_url
|
|
end
|
|
|
|
def agent
|
|
@agent ||= PixivWebAgent.build
|
|
end
|
|
|
|
def file_url
|
|
image_url || zip_url
|
|
end
|
|
|
|
protected
|
|
|
|
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg
|
|
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.png
|
|
def rewrite_new_medium_images(thumbnail_url)
|
|
if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_p\d+_\w+\.jpg!i
|
|
thumbnail_url = thumbnail_url.sub(%r!/c/\d+x\d+/img-master/!i, '/img-original/')
|
|
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg
|
|
|
|
page = manga_page_from_url(@url)
|
|
thumbnail_url = thumbnail_url.sub(%r!_p(\d+)_\w+\.jpg$!i, "_p#{page}.")
|
|
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.
|
|
|
|
illust_id = illust_id_from_url(@url)
|
|
get_metadata_from_spapi!(illust_id) do |metadata|
|
|
file_ext = metadata[2]
|
|
thumbnail_url += file_ext
|
|
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.png
|
|
end
|
|
end
|
|
|
|
thumbnail_url
|
|
end
|
|
|
|
# http://i3.pixiv.net/img-zip-ugoira/img/2014/12/03/04/58/24/47378698_ugoira600x600.zip
|
|
# => http://i3.pixiv.net/img-zip-ugoira/img/2014/12/03/04/58/24/47378698_ugoira1920x1080.zip
|
|
def rewrite_medium_ugoiras(thumbnail_url)
|
|
if thumbnail_url =~ %r!/img-zip-ugoira/img/.*/\d+_ugoira600x600.zip!i
|
|
thumbnail_url = thumbnail_url.sub("_ugoira600x600.zip", "_ugoira1920x1080.zip")
|
|
end
|
|
|
|
thumbnail_url
|
|
end
|
|
|
|
# If the thumbnail is for a manga gallery, it needs to be rewritten like this:
|
|
#
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
|
|
# => http://i2.pixiv.net/img18/img/evazion/14901720_big_p0.png
|
|
#
|
|
# Otherwise, it needs to be rewritten like this:
|
|
#
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
|
|
# => http://i2.pixiv.net/img18/img/evazion/14901720.png
|
|
#
|
|
def rewrite_old_small_and_medium_images(thumbnail_url, is_manga)
|
|
if thumbnail_url =~ %r!/img/#{MONIKER}/\d+_[ms]\.#{EXT}!i
|
|
if is_manga.nil?
|
|
illust_id = illust_id_from_url(@url)
|
|
get_metadata_from_spapi!(illust_id) do |metadata|
|
|
page_count = metadata[19].to_i || 1
|
|
is_manga = page_count > 1
|
|
end
|
|
end
|
|
|
|
if is_manga
|
|
page = manga_page_from_url(@url)
|
|
return thumbnail_url.sub(/_[ms]\./, "_big_p#{page}.")
|
|
else
|
|
return thumbnail_url.sub(/_[ms]\./, ".")
|
|
end
|
|
end
|
|
|
|
return thumbnail_url
|
|
end
|
|
|
|
def manga_page_from_url(url)
|
|
# http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
|
|
# http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
|
|
# http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg
|
|
if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.#{EXT}!i
|
|
$1
|
|
|
|
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0
|
|
elsif url =~ /page=(\d+)/i
|
|
$1
|
|
|
|
else
|
|
0
|
|
end
|
|
end
|
|
|
|
def get_profile_from_page(page)
|
|
profile_url = page.search("a.user-link").first
|
|
if profile_url
|
|
profile_url = "http://www.pixiv.net" + profile_url["href"]
|
|
end
|
|
|
|
artist_name = page.search("h1.user").first
|
|
if artist_name
|
|
artist_name = artist_name.inner_text
|
|
end
|
|
|
|
return [artist_name, profile_url]
|
|
end
|
|
|
|
def get_moniker_from_page(page)
|
|
# <a class="tab-feed" href="/stacc/gennmai-226">Feed</a>
|
|
stacc_link = page.search("a.tab-feed").first
|
|
|
|
if not stacc_link.nil?
|
|
stacc_link.attr("href").sub(%r!^/stacc/!i, '')
|
|
else
|
|
raise Sources::Error.new("Couldn't find Pixiv moniker in page: #{normalized_url}")
|
|
end
|
|
end
|
|
|
|
def get_moniker_from_url
|
|
case url
|
|
when %r!#{IMG}/img/(#{MONIKER})!i
|
|
$1
|
|
when %r!#{I12}/img[0-9]+/img/(#{MONIKER})!i
|
|
$1
|
|
when %r!#{WEB}/stacc/(#{MONIKER})/?$!i
|
|
$1
|
|
else
|
|
false
|
|
end
|
|
end
|
|
|
|
def has_moniker?
|
|
get_moniker_from_url != false
|
|
end
|
|
|
|
def get_image_url_from_page(page, is_manga)
|
|
elements = page.search("div.works_display a img").find_all do |node|
|
|
node["src"] !~ /source\.pixiv\.net/
|
|
end
|
|
|
|
if elements.any?
|
|
thumbnail_url = elements.first.attr("src")
|
|
return rewrite_thumbnails(thumbnail_url, is_manga)
|
|
else
|
|
raise Sources::Error.new("Couldn't find image thumbnail URL in page: #{normalized_url}")
|
|
end
|
|
end
|
|
|
|
def get_zip_url_from_page(page)
|
|
scripts = page.search("body script").find_all do |node|
|
|
node.text =~ /_ugoira600x600\.zip/
|
|
end
|
|
|
|
if scripts.any?
|
|
javascript = scripts.first.text
|
|
|
|
json = javascript.match(/;pixiv\.context\.ugokuIllustData\s+=\s+(\{.+?\});(?:$|pixiv\.context)/)[1]
|
|
data = JSON.parse(json)
|
|
zip_url = data["src"].sub("_ugoira600x600.zip", "_ugoira1920x1080.zip")
|
|
frame_data = data["frames"]
|
|
content_type = data["mime_type"]
|
|
|
|
return [zip_url, frame_data, content_type]
|
|
end
|
|
end
|
|
|
|
def get_tags_from_page(page)
|
|
# puts page.root.to_xhtml
|
|
|
|
links = page.search("ul.tags a.text").find_all do |node|
|
|
node["href"] =~ /search\.php/
|
|
end
|
|
|
|
original_flag = page.search("a.original-works")
|
|
|
|
if links.any?
|
|
links.map! do |node|
|
|
[node.inner_text, "http://www.pixiv.net" + node.attr("href")]
|
|
end
|
|
|
|
if original_flag.any?
|
|
links << ["オリジナル", "http://www.pixiv.net/search.php?s_mode=s_tag_full&word=%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB"]
|
|
end
|
|
|
|
links
|
|
else
|
|
[]
|
|
end
|
|
end
|
|
|
|
def get_page_count_from_page(page)
|
|
elements = page.search("ul.meta li").find_all do |node|
|
|
node.text =~ /Manga|漫画|複数枚投稿/
|
|
end
|
|
|
|
if elements.any?
|
|
elements[0].text =~ /(?:Manga|漫画|複数枚投稿) (\d+)P/
|
|
$1.to_i
|
|
else
|
|
1
|
|
end
|
|
end
|
|
|
|
def normalized_url
|
|
illust_id = illust_id_from_url(@url)
|
|
"http://www.pixiv.net/member_illust.php?mode=medium&illust_id=#{illust_id}"
|
|
end
|
|
|
|
# Refer to http://danbooru.donmai.us/wiki_pages/58938 for documentation on the Pixiv API.
|
|
def get_metadata_from_spapi!(illust_id)
|
|
spapi_url = "http://spapi.pixiv.net/iphone/illust.php?illust_id=#{illust_id}&PHPSESSID=#{PixivWebAgent.phpsessid(agent)}"
|
|
|
|
agent.get(spapi_url) do |response|
|
|
metadata = CSV.parse(response.content.force_encoding("UTF-8")).first
|
|
|
|
validate_spapi_metadata!(metadata)
|
|
yield metadata
|
|
end
|
|
end
|
|
|
|
def validate_spapi_metadata!(metadata)
|
|
if metadata.nil?
|
|
raise Sources::Error.new("Pixiv API returned empty response.")
|
|
elsif metadata.size != 31
|
|
raise Sources::Error.new("Pixiv API returned unexpected number of fields.")
|
|
end
|
|
|
|
illust_id = metadata[0]
|
|
file_ext = metadata[2]
|
|
page_count = metadata[19]
|
|
moniker = metadata[24]
|
|
mobile_profile_image = metadata[30]
|
|
|
|
if file_ext !~ /#{EXT}/i
|
|
raise Sources::Error.new("Pixiv API returned unexpected file extension '#{file_ext}' for pixiv ##{illust_id}.")
|
|
elsif moniker !~ /#{MONIKER}/i
|
|
raise Sources::Error.new("Pixiv API returned invalid artist moniker '#{moniker}' for pixiv ##{illust_id}.")
|
|
elsif page_count.to_s !~ /[0-9]*/i
|
|
raise Sources::Error.new("Pixiv API returned invalid page count '#{page_count}' for pixiv ##{illust_id}.")
|
|
end
|
|
|
|
if mobile_profile_image
|
|
# http://i1.pixiv.net/img01/profile/ccz67420/mobile/5042957_80.jpg
|
|
profile_regex = %r!i[12]\.pixiv\.net/img\d+/profile/#{MONIKER}/mobile/\d+_\d+\.jpg!i
|
|
mobile_moniker = mobile_profile_image.match(profile_regex)[1]
|
|
|
|
if mobile_moniker != moniker
|
|
raise Sources::Error.new("Pixiv API returned inconsistent artist moniker '#{moniker}' for pixiv ##{illust_id}.")
|
|
end
|
|
end
|
|
end
|
|
|
|
def illust_id_from_url(url)
|
|
# http://img18.pixiv.net/img/evazion/14901720.png
|
|
#
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720.png
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720_s.png
|
|
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
|
|
# http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png
|
|
#
|
|
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_64x64.jpg
|
|
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
|
|
#
|
|
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg
|
|
# http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png
|
|
#
|
|
# http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip
|
|
if url =~ %r!/(\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)!i
|
|
$1
|
|
|
|
# http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054
|
|
# http://www.pixiv.net/member_illust.php?mode=big&illust_id=18557054
|
|
# http://www.pixiv.net/member_illust.php?mode=manga&illust_id=18557054
|
|
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=18557054&page=1
|
|
elsif url =~ /illust_id=(\d+)/i
|
|
$1
|
|
|
|
# http://www.pixiv.net/i/18557054
|
|
elsif url =~ %r!pixiv\.net/i/(\d+)!i
|
|
$1
|
|
|
|
else
|
|
raise Sources::Error.new("Couldn't get illust ID from URL: #{url}")
|
|
end
|
|
end
|
|
|
|
def work_page?
|
|
return true if url =~ %r!#{WEB}/member_illust\.php\?mode=(?:medium|big|manga|manga_big)&illust_id=\d+!i
|
|
return true if url =~ %r!#{WEB}/i/\d+$!i
|
|
return false
|
|
end
|
|
|
|
def full_image?
|
|
# http://img18.pixiv.net/img/evazion/14901720.png?1234
|
|
return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i
|
|
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720.png
|
|
# http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png
|
|
return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i
|
|
|
|
# http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png
|
|
return true if url =~ %r!#{I12}/img-original/img/#{TIMESTAMP}/\d+_p\d+\.#{EXT}$!i
|
|
|
|
# http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip
|
|
return true if url =~ %r!#{I12}/img-zip-ugoira/img/#{TIMESTAMP}/\d+_ugoira\d+x\d+\.zip$!i
|
|
|
|
return false
|
|
end
|
|
|
|
def sample_image?
|
|
# http://img18.pixiv.net/img/evazion/14901720_m.png
|
|
return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i
|
|
|
|
# http://i2.pixiv.net/img18/img/evazion/14901720_m.png
|
|
# http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
|
|
return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i
|
|
|
|
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg
|
|
# http://i2.pixiv.net/c/64x64/img-master/img/2014/10/09/12/59/50/46441917_square1200.jpg
|
|
return true if url =~ %r!#{I12}/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}$!i
|
|
|
|
# http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
|
|
# http://i2.pixiv.net/img-inf/img/2010/11/30/08/54/06/14901765_64x64.jpg
|
|
return true if url =~ %r!#{I12}/img-inf/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}!i
|
|
|
|
return false
|
|
end
|
|
|
|
def agent
|
|
@agent ||= begin
|
|
mech = Mechanize.new
|
|
|
|
phpsessid = Cache.get("pixiv-phpsessid")
|
|
if phpsessid
|
|
cookie = Mechanize::Cookie.new("PHPSESSID", phpsessid)
|
|
cookie.domain = ".pixiv.net"
|
|
cookie.path = "/"
|
|
mech.cookie_jar.add(cookie)
|
|
else
|
|
mech.get("http://www.pixiv.net") do |page|
|
|
page.form_with(:action => "/login.php") do |form|
|
|
form['pixiv_id'] = Danbooru.config.pixiv_login
|
|
form['pass'] = Danbooru.config.pixiv_password
|
|
end.click_button
|
|
end
|
|
phpsessid = mech.cookie_jar.cookies.select{|c| c.name == "PHPSESSID"}.first
|
|
Cache.put("pixiv-phpsessid", phpsessid.value, 1.month) if phpsessid
|
|
end
|
|
|
|
mech
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|