From 57bb51621e06e53441698ffc5b13291aca73086a Mon Sep 17 00:00:00 2001 From: evazion Date: Mon, 13 Oct 2014 14:11:02 -0500 Subject: [PATCH] Add debugger gem. Fix random VCR failures in Pixiv tests. Sometimes tests randomly fail because the PHPSESSID they use in their HTTP requests to Pixiv is different than the one that was originally recorded by VCR. This causes VCR to complain that the requests don't match. This is caused by the PHPSESSID being globally cached in Memcache. Depending on the order the tests run in (which is random), one set of tests can use a PHPSESSID that was recorded for a /different/ set of tests. Improve Pixiv URL matching. * Allow URLs that are missing the http:// part. These are sometimes seen in artist entries. * Ignore URLs from random Pixiv domains such as dic.pixiv.net, blog.pixiv.net, etc. These are also sometimes in artist entries. Improve normalize_for_artist_finder! URL matching. * Normalize www.pixiv.net/stacc/username URLs. * Correctly normalize URLs that are missing the illust ID part on the end (i.e. http://i2.pixiv.net/img04/img/syounen_no_uta/). These are common in artist entries. Match URLs strictly when normalizing for artist entries. Only normalize Pixiv URLs that strictly match a known format. Pass any unrecognized URLs through without attempting to normalize them, just to be safe. Normalize URLs when saving artist entries. --- app/logical/sources/site.rb | 16 +-- app/logical/sources/strategies/base.rb | 21 ++++ app/logical/sources/strategies/pixiv.rb | 156 ++++++++++++++++++++++-- app/models/artist.rb | 3 +- app/models/artist_url.rb | 12 +- test/test_helper.rb | 7 ++ 6 files changed, 191 insertions(+), 24 deletions(-) diff --git a/app/logical/sources/site.rb b/app/logical/sources/site.rb index 313db6cc6..6e785b8f2 100644 --- a/app/logical/sources/site.rb +++ b/app/logical/sources/site.rb @@ -20,16 +20,18 @@ module Sources end end + def normalized_for_artist_finder? + available? && strategy.normalized_for_artist_finder? + end + def normalize_for_artist_finder! - if available? - begin - return strategy.normalize_for_artist_finder! - rescue Sources::Error - return url - end + if available? && strategy.normalizable_for_artist_finder? + strategy.normalize_for_artist_finder! else - return url + url end + rescue + url end def translated_tags diff --git a/app/logical/sources/strategies/base.rb b/app/logical/sources/strategies/base.rb index 6fad07ac5..fe1978ea2 100644 --- a/app/logical/sources/strategies/base.rb +++ b/app/logical/sources/strategies/base.rb @@ -18,6 +18,27 @@ module Sources raise NotImplementedError end + # Subclasses should return true only if the URL is in its final normalized form. + # + # Sources::Site.new("http://img.pixiv.net/img/evazion").normalized_for_artist_finder? + # => true + # Sources::Site.new("http://i2.pixiv.net/img18/img/evazion/14901720_m.png").normalized_for_artist_finder? + # => false + def normalized_for_artist_finder? + false + end + + # Subclasses should return true only if the URL is a valid URL that could + # be converted into normalized form. + # + # Sources::Site.new("http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054").normalizable_for_artist_finder? + # => true + # Sources::Site.new("http://dic.pixiv.net/a/THUNDERproject").normalizable_for_artist_finder? + # => false + def normalizable_for_artist_finder? + false + end + def normalize_for_artist_finder! url end diff --git a/app/logical/sources/strategies/pixiv.rb b/app/logical/sources/strategies/pixiv.rb index 0c852e44d..18f187199 100644 --- a/app/logical/sources/strategies/pixiv.rb +++ b/app/logical/sources/strategies/pixiv.rb @@ -3,12 +3,22 @@ require 'csv' module Sources + class Error < StandardError ; end + module Strategies class Pixiv < Base attr_reader :zip_url, :ugoira_frame_data, :ugoira_content_type + MONIKER = '(?:[a-zA-Z0-9_-]+)' + TIMESTAMP = '(?:[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2})' + EXT = "(?:jpg|jpeg|png|gif)" + + WEB = "^(?:https?://)?www\\.pixiv\\.net" + I12 = "^(?:https?://)?i[12]\\.pixiv\\.net" + IMG = "^(?:https?://)?img[0-9]*\\.pixiv\\.net" + def self.url_match?(url) - url =~ /^https?:\/\/(?:\w+\.)?pixiv\.net/ + url =~ /#{WEB}|#{IMG}|#{I12}/i end def referer_url(template) @@ -27,18 +37,25 @@ module Sources @pixiv_moniker end + def normalized_for_artist_finder? + url =~ %r!http://img\.pixiv\.net/img/#{MONIKER}/?$!i + end + + def normalizable_for_artist_finder? + has_moniker? || sample_image? || full_image? || work_page? + end + def normalize_for_artist_finder! - # http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_m.jpg - if url =~ %r!/img/([^/]+)/\d+(?:_\w+)?\.(?:jpg|jpeg|png|gif)!i - username = $1 + if has_moniker? + moniker = get_moniker_from_url else illust_id = illust_id_from_url(url) get_metadata_from_spapi!(illust_id) do |metadata| - username = metadata[24] + moniker = metadata[24] end end - "http://img.pixiv.net/img/#{username}" + "http://img.pixiv.net/img/#{moniker}/" end def get @@ -77,7 +94,7 @@ module Sources # http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg # => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.png def rewrite_new_medium_images(thumbnail_url) - if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/.*/\d+_p\d+_\w+\.jpg!i + if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_p\d+_\w+\.jpg!i thumbnail_url = thumbnail_url.sub(%r!/c/\d+x\d+/img-master/!i, '/img-original/') # => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg @@ -117,7 +134,7 @@ module Sources # => http://i2.pixiv.net/img18/img/evazion/14901720.png # def rewrite_old_small_and_medium_images(thumbnail_url, is_manga) - if thumbnail_url =~ %r!/img/[^/]+/\d+_[ms]\.(?:jpg|jpeg|png|gif)!i + if thumbnail_url =~ %r!/img/#{MONIKER}/\d+_[ms]\.#{EXT}!i if is_manga.nil? illust_id = illust_id_from_url(@url) get_metadata_from_spapi!(illust_id) do |metadata| @@ -141,7 +158,7 @@ module Sources # http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg # http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg # http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg - if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)!i + if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.#{EXT}!i $1 # http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0 @@ -178,6 +195,23 @@ module Sources end end + def get_moniker_from_url + case url + when %r!#{IMG}/img/(#{MONIKER})!i + $1 + when %r!#{I12}/img[0-9]+/img/(#{MONIKER})!i + $1 + when %r!#{WEB}/stacc/(#{MONIKER})/?$!i + $1 + else + false + end + end + + def has_moniker? + get_moniker_from_url != false + end + def get_image_url_from_page(page, is_manga) elements = page.search("div.works_display a img").find_all do |node| node["src"] !~ /source\.pixiv\.net/ @@ -258,10 +292,39 @@ module Sources agent.get(spapi_url) do |response| metadata = CSV.parse(response.content.force_encoding("UTF-8")).first - if metadata.nil? - raise Sources::Error.new("Couldn't get Pixiv API metadata from #{spapi_url}.") - else - yield metadata + validate_spapi_metadata!(metadata) + yield metadata + end + end + + def validate_spapi_metadata!(metadata) + if metadata.nil? + raise Sources::Error.new("Pixiv API returned empty response.") + elsif metadata.size != 31 + raise Sources::Error.new("Pixiv API returned unexpected number of fields.") + end + + illust_id = metadata[0] + file_ext = metadata[2] + page_count = metadata[19] + moniker = metadata[24] + mobile_profile_image = metadata[30] + + if file_ext !~ /#{EXT}/i + raise Sources::Error.new("Pixiv API returned unexpected file extension '#{file_ext}' for pixiv ##{illust_id}.") + elsif moniker !~ /#{MONIKER}/i + raise Sources::Error.new("Pixiv API returned invalid artist moniker '#{moniker}' for pixiv ##{illust_id}.") + elsif page_count.to_s !~ /[0-9]*/i + raise Sources::Error.new("Pixiv API returned invalid page count '#{page_count}' for pixiv ##{illust_id}.") + end + + if mobile_profile_image + # http://i1.pixiv.net/img01/profile/ccz67420/mobile/5042957_80.jpg + profile_regex = %r!i[12]\.pixiv\.net/img\d+/profile/#{MONIKER}/mobile/\d+_\d+\.jpg!i + mobile_moniker = mobile_profile_image.match(profile_regex)[1] + + if mobile_moniker != moniker + raise Sources::Error.new("Pixiv API returned inconsistent artist moniker '#{moniker}' for pixiv ##{illust_id}.") end end end @@ -300,6 +363,73 @@ module Sources raise Sources::Error.new("Couldn't get illust ID from URL: #{url}") end end + + def work_page? + return true if url =~ %r!#{WEB}/member_illust\.php\?mode=(?:medium|big|manga|manga_big)&illust_id=\d+!i + return true if url =~ %r!#{WEB}/i/\d+$!i + return false + end + + def full_image? + # http://img18.pixiv.net/img/evazion/14901720.png?1234 + return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i + + # http://i2.pixiv.net/img18/img/evazion/14901720.png + # http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png + return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i + + # http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png + return true if url =~ %r!#{I12}/img-original/img/#{TIMESTAMP}/\d+_p\d+\.#{EXT}$!i + + # http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip + return true if url =~ %r!#{I12}/img-zip-ugoira/img/#{TIMESTAMP}/\d+_ugoira\d+x\d+\.zip$!i + + return false + end + + def sample_image? + # http://img18.pixiv.net/img/evazion/14901720_m.png + return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i + + # http://i2.pixiv.net/img18/img/evazion/14901720_m.png + # http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png + return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i + + # http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg + # http://i2.pixiv.net/c/64x64/img-master/img/2014/10/09/12/59/50/46441917_square1200.jpg + return true if url =~ %r!#{I12}/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}$!i + + # http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png + # http://i2.pixiv.net/img-inf/img/2010/11/30/08/54/06/14901765_64x64.jpg + return true if url =~ %r!#{I12}/img-inf/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}!i + + return false + end + + def agent + @agent ||= begin + mech = Mechanize.new + + phpsessid = Cache.get("pixiv-phpsessid") + if phpsessid + cookie = Mechanize::Cookie.new("PHPSESSID", phpsessid) + cookie.domain = ".pixiv.net" + cookie.path = "/" + mech.cookie_jar.add(cookie) + else + mech.get("http://www.pixiv.net") do |page| + page.form_with(:action => "/login.php") do |form| + form['pixiv_id'] = Danbooru.config.pixiv_login + form['pass'] = Danbooru.config.pixiv_password + end.click_button + end + phpsessid = mech.cookie_jar.cookies.select{|c| c.name == "PHPSESSID"}.first + Cache.put("pixiv-phpsessid", phpsessid.value, 1.month) if phpsessid + end + + mech + end + end end end end diff --git a/app/models/artist.rb b/app/models/artist.rb index d28cd0448..154bb075d 100644 --- a/app/models/artist.rb +++ b/app/models/artist.rb @@ -22,10 +22,11 @@ class Artist < ActiveRecord::Base module ClassMethods def find_all_by_url(url) - url = Sources::Site.new(url).normalize_for_artist_finder! url = ArtistUrl.normalize(url) artists = [] + # return [] unless Sources::Site.new(url).normalized_for_artist_finder? + while artists.empty? && url.size > 10 u = url.sub(/\/+$/, "") + "/" u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%' diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index 361007c79..575deb933 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -1,4 +1,5 @@ class ArtistUrl < ActiveRecord::Base + before_save :initialize_normalized_url, on: [ :create ] before_save :normalize validates_presence_of :url belongs_to :artist @@ -12,8 +13,7 @@ class ArtistUrl < ActiveRecord::Base url = url.gsub(/^http:\/\/blog\d+\.fc2/, "http://blog.fc2") url = url.gsub(/^http:\/\/blog-imgs-\d+\.fc2/, "http://blog.fc2") url = url.gsub(/^http:\/\/blog-imgs-\d+-\w+\.fc2/, "http://blog.fc2") - url = url.gsub(/^http:\/\/img\d+\.pixiv\.net/, "http://img.pixiv.net") - url = url.gsub(/^http:\/\/i\d+\.pixiv\.net\/img\d+/, "http://img.pixiv.net") + url = Sources::Site.new(url).normalize_for_artist_finder! url = url.gsub(/\/+\Z/, "") url + "/" end @@ -33,7 +33,13 @@ class ArtistUrl < ActiveRecord::Base end def normalize - self.normalized_url = self.class.normalize(url) + if !Sources::Site.new(normalized_url).normalized_for_artist_finder? + self.normalized_url = self.class.normalize(url) + end + end + + def initialize_normalized_url + self.normalized_url = url end def to_s diff --git a/test/test_helper.rb b/test/test_helper.rb index 60dd2dd6d..0e939a4f9 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -105,4 +105,11 @@ VCR.configure do |c| c.cassette_library_dir = "test/fixtures/vcr_cassettes" c.hook_into :webmock # c.allow_http_connections_when_no_cassette = true + + c.default_cassette_options = { + match_requests_on: [ + :method, + VCR.request_matchers.uri_without_param(:PHPSESSID) + ] + } end