Add debugger gem.

Fix random VCR failures in Pixiv tests.

Sometimes tests randomly fail because the PHPSESSID they use in their
HTTP requests to Pixiv is different from the one that was originally
recorded by VCR. This causes VCR to complain that the requests don't
match.

This is caused by the PHPSESSID being globally cached in Memcache.
Depending on the order the tests run in (which is random), one set of
tests can use a PHPSESSID that was recorded for a /different/ set of
tests.

Improve Pixiv URL matching.

* Allow URLs that are missing the http:// part. These are sometimes seen
  in artist entries.
* Ignore URLs from random Pixiv domains such as dic.pixiv.net,
  blog.pixiv.net, etc. These are also sometimes in artist entries.

Improve normalize_for_artist_finder! URL matching.

* Normalize www.pixiv.net/stacc/username URLs.
* Correctly normalize URLs that are missing the illust ID part on the end
  (e.g. http://i2.pixiv.net/img04/img/syounen_no_uta/). These are common
  in artist entries.

Match URLs strictly when normalizing for artist entries.

Only normalize Pixiv URLs that strictly match a known format. Pass any
unrecognized URLs through without attempting to normalize them, just to
be safe.

Normalize URLs when saving artist entries.
This commit is contained in:
evazion
2014-10-13 14:11:02 -05:00
committed by r888888888
parent 63193593d9
commit 57bb51621e
6 changed files with 191 additions and 24 deletions

View File

@@ -20,16 +20,18 @@ module Sources
end
end
# Truthy only when `available?` holds and the selected strategy reports the
# URL as already being in normalized artist-finder form. When `available?` is
# falsy the strategy is never consulted.
def normalized_for_artist_finder?
available? && strategy.normalized_for_artist_finder?
end
def normalize_for_artist_finder!
if available?
begin
return strategy.normalize_for_artist_finder!
rescue Sources::Error
return url
end
if available? && strategy.normalizable_for_artist_finder?
strategy.normalize_for_artist_finder!
else
return url
url
end
rescue
url
end
def translated_tags

View File

@@ -18,6 +18,27 @@ module Sources
raise NotImplementedError
end
# Subclasses should return true only if the URL is in its final normalized form.
#
# Sources::Site.new("http://img.pixiv.net/img/evazion").normalized_for_artist_finder?
# => true
# Sources::Site.new("http://i2.pixiv.net/img18/img/evazion/14901720_m.png").normalized_for_artist_finder?
# => false
#
# This default implementation always returns false.
def normalized_for_artist_finder?
false
end
# Subclasses should return true only if the URL is a valid URL that could
# be converted into normalized form.
#
# Sources::Site.new("http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054").normalizable_for_artist_finder?
# => true
# Sources::Site.new("http://dic.pixiv.net/a/THUNDERproject").normalizable_for_artist_finder?
# => false
#
# This default implementation always returns false.
def normalizable_for_artist_finder?
false
end
# Default normalization: hands back `url` as-is, without modifying it.
def normalize_for_artist_finder!
url
end

View File

@@ -3,12 +3,22 @@
require 'csv'
module Sources
class Error < StandardError ; end
module Strategies
class Pixiv < Base
attr_reader :zip_url, :ugoira_frame_data, :ugoira_content_type
MONIKER = '(?:[a-zA-Z0-9_-]+)'
TIMESTAMP = '(?:[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2})'
EXT = "(?:jpg|jpeg|png|gif)"
WEB = "^(?:https?://)?www\\.pixiv\\.net"
I12 = "^(?:https?://)?i[12]\\.pixiv\\.net"
IMG = "^(?:https?://)?img[0-9]*\\.pixiv\\.net"
def self.url_match?(url)
url =~ /^https?:\/\/(?:\w+\.)?pixiv\.net/
url =~ /#{WEB}|#{IMG}|#{I12}/i
end
def referer_url(template)
@@ -27,18 +37,25 @@ module Sources
@pixiv_moniker
end
# Checks whether `url` is already in the canonical artist-finder form, i.e.
# exactly http://img.pixiv.net/img/<moniker> with an optional trailing slash.
#
# The pattern is anchored with \A and \z (rather than left open at the start
# and closed with the per-line `$`) so that a URL which merely contains the
# canonical form somewhere inside it, or which carries a trailing newline, is
# not mistaken for an already-normalized URL.
#
# Returns 0 (truthy) when the URL is in normalized form, nil otherwise.
def normalized_for_artist_finder?
  url =~ %r!\Ahttp://img\.pixiv\.net/img/#{MONIKER}/?\z!i
end
# A URL can be normalized when a moniker can be read straight out of it, or
# when it is a recognized sample image, full-size image, or work page URL.
# The checks run in that order and stop at the first one that succeeds.
def normalizable_for_artist_finder?
  return true if has_moniker?
  return true if sample_image?
  return true if full_image?

  work_page?
end
def normalize_for_artist_finder!
# http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_m.jpg
if url =~ %r!/img/([^/]+)/\d+(?:_\w+)?\.(?:jpg|jpeg|png|gif)!i
username = $1
if has_moniker?
moniker = get_moniker_from_url
else
illust_id = illust_id_from_url(url)
get_metadata_from_spapi!(illust_id) do |metadata|
username = metadata[24]
moniker = metadata[24]
end
end
"http://img.pixiv.net/img/#{username}"
"http://img.pixiv.net/img/#{moniker}/"
end
def get
@@ -77,7 +94,7 @@ module Sources
# http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.png
def rewrite_new_medium_images(thumbnail_url)
if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/.*/\d+_p\d+_\w+\.jpg!i
if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_p\d+_\w+\.jpg!i
thumbnail_url = thumbnail_url.sub(%r!/c/\d+x\d+/img-master/!i, '/img-original/')
# => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg
@@ -117,7 +134,7 @@ module Sources
# => http://i2.pixiv.net/img18/img/evazion/14901720.png
#
def rewrite_old_small_and_medium_images(thumbnail_url, is_manga)
if thumbnail_url =~ %r!/img/[^/]+/\d+_[ms]\.(?:jpg|jpeg|png|gif)!i
if thumbnail_url =~ %r!/img/#{MONIKER}/\d+_[ms]\.#{EXT}!i
if is_manga.nil?
illust_id = illust_id_from_url(@url)
get_metadata_from_spapi!(illust_id) do |metadata|
@@ -141,7 +158,7 @@ module Sources
# http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg
# http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg
# http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg
if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)!i
if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.#{EXT}!i
$1
# http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0
@@ -178,6 +195,23 @@ module Sources
end
end
# Extracts the artist moniker embedded in `url`.
#
# Three URL shapes are recognized, tried in this order:
# * <IMG host>/img/<moniker>...
# * <I12 host>/img<digits>/img/<moniker>...
# * <WEB host>/stacc/<moniker> (optionally followed by a single slash)
#
# Returns the moniker String, or false when none of the shapes match.
def get_moniker_from_url
  moniker_patterns = [
    %r!#{IMG}/img/(#{MONIKER})!i,
    %r!#{I12}/img[0-9]+/img/(#{MONIKER})!i,
    %r!#{WEB}/stacc/(#{MONIKER})/?$!i
  ]

  moniker_patterns.each do |pattern|
    found = pattern.match(url)
    return found[1] if found
  end

  false
end
# True when get_moniker_from_url yields anything other than false, i.e. when
# a moniker could be read out of the URL.
def has_moniker?
  extracted = get_moniker_from_url
  extracted != false
end
def get_image_url_from_page(page, is_manga)
elements = page.search("div.works_display a img").find_all do |node|
node["src"] !~ /source\.pixiv\.net/
@@ -258,10 +292,39 @@ module Sources
agent.get(spapi_url) do |response|
metadata = CSV.parse(response.content.force_encoding("UTF-8")).first
if metadata.nil?
raise Sources::Error.new("Couldn't get Pixiv API metadata from #{spapi_url}.")
else
yield metadata
validate_spapi_metadata!(metadata)
yield metadata
end
end
# Sanity-checks one row of Pixiv API metadata before it is handed to callers.
#
# metadata - Array of fields from the API response, or nil.
#
# Checks performed, in order:
# * the row is present and has exactly 31 fields;
# * field 2 (file extension) is one of the extensions in EXT;
# * field 24 (artist moniker) is a well-formed MONIKER;
# * field 19 (page count) is blank or consists only of digits;
# * when field 30 (mobile profile image URL) has the recognized
#   .../profile/<moniker>/mobile/... shape, the moniker in that URL equals
#   field 24.
#
# Raises Sources::Error when any check fails.
def validate_spapi_metadata!(metadata)
  if metadata.nil?
    raise Sources::Error.new("Pixiv API returned empty response.")
  elsif metadata.size != 31
    raise Sources::Error.new("Pixiv API returned unexpected number of fields.")
  end

  illust_id = metadata[0]
  file_ext = metadata[2]
  page_count = metadata[19]
  moniker = metadata[24]
  mobile_profile_image = metadata[30]

  # Every pattern is anchored to the whole value. Unanchored, /[0-9]*/ matches
  # any string at all (so the page count check could never fail), and the
  # other two would accept values that merely contain a valid fragment.
  if file_ext.to_s !~ /\A#{EXT}\z/i
    raise Sources::Error.new("Pixiv API returned unexpected file extension '#{file_ext}' for pixiv ##{illust_id}.")
  elsif moniker.to_s !~ /\A#{MONIKER}\z/i
    raise Sources::Error.new("Pixiv API returned invalid artist moniker '#{moniker}' for pixiv ##{illust_id}.")
  elsif page_count.to_s !~ /\A[0-9]*\z/
    raise Sources::Error.new("Pixiv API returned invalid page count '#{page_count}' for pixiv ##{illust_id}.")
  end

  if mobile_profile_image
    # http://i1.pixiv.net/img01/profile/ccz67420/mobile/5042957_80.jpg
    #
    # MONIKER itself is a non-capturing group, so it has to be wrapped in
    # explicit parentheses for capture 1 to exist.
    profile_regex = %r!i[12]\.pixiv\.net/img\d+/profile/(#{MONIKER})/mobile/\d+_\d+\.jpg!i

    # String#[] returns nil when the URL does not have the expected shape, in
    # which case there is nothing to cross-check against and the row is
    # accepted rather than crashing on a nil match.
    mobile_moniker = mobile_profile_image[profile_regex, 1]

    if mobile_moniker && mobile_moniker != moniker
      raise Sources::Error.new("Pixiv API returned inconsistent artist moniker '#{moniker}' for pixiv ##{illust_id}.")
    end
  end
end
@@ -300,6 +363,73 @@ module Sources
raise Sources::Error.new("Couldn't get illust ID from URL: #{url}")
end
end
# Whether `url` is one of the recognized work page URL forms on the WEB host:
# * /member_illust.php?mode=<medium|big|manga|manga_big>&illust_id=<digits>
# * /i/<digits>
#
# Returns true or false.
def work_page?
  page_patterns = [
    %r!#{WEB}/member_illust\.php\?mode=(?:medium|big|manga|manga_big)&illust_id=\d+!i,
    %r!#{WEB}/i/\d+$!i
  ]

  page_patterns.any? { |pattern| url =~ pattern }
end
# Whether `url` matches one of the recognized full-size image URL forms.
#
# Returns true or false.
def full_image?
  full_size_patterns = [
    # http://img18.pixiv.net/img/evazion/14901720.png?1234
    %r!#{IMG}/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i,

    # http://i2.pixiv.net/img18/img/evazion/14901720.png
    # http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png
    %r!#{I12}/img\d+/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i,

    # http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png
    %r!#{I12}/img-original/img/#{TIMESTAMP}/\d+_p\d+\.#{EXT}$!i,

    # http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip
    %r!#{I12}/img-zip-ugoira/img/#{TIMESTAMP}/\d+_ugoira\d+x\d+\.zip$!i
  ]

  full_size_patterns.any? { |pattern| url =~ pattern }
end
# Whether `url` matches one of the recognized sample (thumbnail / resized)
# image URL forms.
#
# Returns true or false.
def sample_image?
  sample_patterns = [
    # http://img18.pixiv.net/img/evazion/14901720_m.png
    %r!#{IMG}/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i,

    # http://i2.pixiv.net/img18/img/evazion/14901720_m.png
    # http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png
    %r!#{I12}/img\d+/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i,

    # http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg
    # http://i2.pixiv.net/c/64x64/img-master/img/2014/10/09/12/59/50/46441917_square1200.jpg
    %r!#{I12}/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}$!i,

    # http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png
    # http://i2.pixiv.net/img-inf/img/2010/11/30/08/54/06/14901765_64x64.jpg
    %r!#{I12}/img-inf/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}!i
  ]

  sample_patterns.any? { |pattern| url =~ pattern }
end
# Returns the Mechanize agent used for Pixiv requests, building it on first
# use and memoizing it in @agent.
#
# The agent carries a PHPSESSID cookie: either one read from Cache under the
# "pixiv-phpsessid" key, or one obtained by logging in with the credentials
# from Danbooru.config, which is then written back to Cache.
def agent
@agent ||= begin
mech = Mechanize.new
phpsessid = Cache.get("pixiv-phpsessid")
if phpsessid
# A session ID is already cached: attach it as a cookie for .pixiv.net
# instead of logging in again.
cookie = Mechanize::Cookie.new("PHPSESSID", phpsessid)
cookie.domain = ".pixiv.net"
cookie.path = "/"
mech.cookie_jar.add(cookie)
else
# No cached session: fetch the front page, fill in the /login.php form with
# the configured credentials and submit it.
mech.get("http://www.pixiv.net") do |page|
page.form_with(:action => "/login.php") do |form|
form['pixiv_id'] = Danbooru.config.pixiv_login
form['pass'] = Danbooru.config.pixiv_password
end.click_button
end
# Pick the PHPSESSID cookie out of the jar (if any) and cache its value for
# one month under the same key that is read above.
phpsessid = mech.cookie_jar.cookies.select{|c| c.name == "PHPSESSID"}.first
Cache.put("pixiv-phpsessid", phpsessid.value, 1.month) if phpsessid
end
mech
end
end
end
end
end

View File

@@ -22,10 +22,11 @@ class Artist < ActiveRecord::Base
module ClassMethods
def find_all_by_url(url)
url = Sources::Site.new(url).normalize_for_artist_finder!
url = ArtistUrl.normalize(url)
artists = []
# return [] unless Sources::Site.new(url).normalized_for_artist_finder?
while artists.empty? && url.size > 10
u = url.sub(/\/+$/, "") + "/"
u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'

View File

@@ -1,4 +1,5 @@
class ArtistUrl < ActiveRecord::Base
before_save :initialize_normalized_url, on: [ :create ]
before_save :normalize
validates_presence_of :url
belongs_to :artist
@@ -12,8 +13,7 @@ class ArtistUrl < ActiveRecord::Base
url = url.gsub(/^http:\/\/blog\d+\.fc2/, "http://blog.fc2")
url = url.gsub(/^http:\/\/blog-imgs-\d+\.fc2/, "http://blog.fc2")
url = url.gsub(/^http:\/\/blog-imgs-\d+-\w+\.fc2/, "http://blog.fc2")
url = url.gsub(/^http:\/\/img\d+\.pixiv\.net/, "http://img.pixiv.net")
url = url.gsub(/^http:\/\/i\d+\.pixiv\.net\/img\d+/, "http://img.pixiv.net")
url = Sources::Site.new(url).normalize_for_artist_finder!
url = url.gsub(/\/+\Z/, "")
url + "/"
end
@@ -33,7 +33,13 @@ class ArtistUrl < ActiveRecord::Base
end
def normalize
self.normalized_url = self.class.normalize(url)
if !Sources::Site.new(normalized_url).normalized_for_artist_finder?
self.normalized_url = self.class.normalize(url)
end
end
# Seeds normalized_url with the raw, un-normalized url.
def initialize_normalized_url
self.normalized_url = url
end
def to_s

View File

@@ -105,4 +105,11 @@ VCR.configure do |c|
c.cassette_library_dir = "test/fixtures/vcr_cassettes"
c.hook_into :webmock
# c.allow_http_connections_when_no_cassette = true
c.default_cassette_options = {
match_requests_on: [
:method,
VCR.request_matchers.uri_without_param(:PHPSESSID)
]
}
end