diff --git a/app/logical/sources/site.rb b/app/logical/sources/site.rb index 313db6cc6..6e785b8f2 100644 --- a/app/logical/sources/site.rb +++ b/app/logical/sources/site.rb @@ -20,16 +20,18 @@ module Sources end end + def normalized_for_artist_finder? + available? && strategy.normalized_for_artist_finder? + end + def normalize_for_artist_finder! - if available? - begin - return strategy.normalize_for_artist_finder! - rescue Sources::Error - return url - end + if available? && strategy.normalizable_for_artist_finder? + strategy.normalize_for_artist_finder! else - return url + url end + rescue + url end def translated_tags diff --git a/app/logical/sources/strategies/base.rb b/app/logical/sources/strategies/base.rb index 6fad07ac5..fe1978ea2 100644 --- a/app/logical/sources/strategies/base.rb +++ b/app/logical/sources/strategies/base.rb @@ -18,6 +18,27 @@ module Sources raise NotImplementedError end + # Subclasses should return true only if the URL is in its final normalized form. + # + # Sources::Site.new("http://img.pixiv.net/img/evazion").normalized_for_artist_finder? + # => true + # Sources::Site.new("http://i2.pixiv.net/img18/img/evazion/14901720_m.png").normalized_for_artist_finder? + # => false + def normalized_for_artist_finder? + false + end + + # Subclasses should return true only if the URL is a valid URL that could + # be converted into normalized form. + # + # Sources::Site.new("http://www.pixiv.net/member_illust.php?mode=medium&illust_id=18557054").normalizable_for_artist_finder? + # => true + # Sources::Site.new("http://dic.pixiv.net/a/THUNDERproject").normalizable_for_artist_finder? + # => false + def normalizable_for_artist_finder? + false + end + def normalize_for_artist_finder! url end diff --git a/app/logical/sources/strategies/pixiv.rb b/app/logical/sources/strategies/pixiv.rb index 0c852e44d..18f187199 100644 --- a/app/logical/sources/strategies/pixiv.rb +++ b/app/logical/sources/strategies/pixiv.rb @@ -3,12 +3,22 @@ require 'csv' module Sources + class Error < StandardError ; end + module Strategies class Pixiv < Base attr_reader :zip_url, :ugoira_frame_data, :ugoira_content_type + MONIKER = '(?:[a-zA-Z0-9_-]+)' + TIMESTAMP = '(?:[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2}/[0-9]{2})' + EXT = "(?:jpg|jpeg|png|gif)" + + WEB = "^(?:https?://)?www\\.pixiv\\.net" + I12 = "^(?:https?://)?i[12]\\.pixiv\\.net" + IMG = "^(?:https?://)?img[0-9]*\\.pixiv\\.net" + def self.url_match?(url) - url =~ /^https?:\/\/(?:\w+\.)?pixiv\.net/ + url =~ /#{WEB}|#{IMG}|#{I12}/i end def referer_url(template) @@ -27,18 +37,25 @@ module Sources @pixiv_moniker end + def normalized_for_artist_finder? + url =~ %r!http://img\.pixiv\.net/img/#{MONIKER}/?$!i + end + + def normalizable_for_artist_finder? + has_moniker? || sample_image? || full_image? || work_page? + end + def normalize_for_artist_finder! - # http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_m.jpg - if url =~ %r!/img/([^/]+)/\d+(?:_\w+)?\.(?:jpg|jpeg|png|gif)!i - username = $1 + if has_moniker? + moniker = get_moniker_from_url else illust_id = illust_id_from_url(url) get_metadata_from_spapi!(illust_id) do |metadata| - username = metadata[24] + moniker = metadata[24] end end - "http://img.pixiv.net/img/#{username}" + "http://img.pixiv.net/img/#{moniker}/" end def get @@ -77,7 +94,7 @@ module Sources # http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg # => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1.png def rewrite_new_medium_images(thumbnail_url) - if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/.*/\d+_p\d+_\w+\.jpg!i + if thumbnail_url =~ %r!/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_p\d+_\w+\.jpg!i thumbnail_url = thumbnail_url.sub(%r!/c/\d+x\d+/img-master/!i, '/img-original/') # => http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p1_master1200.jpg @@ -117,7 +134,7 @@ module Sources # => http://i2.pixiv.net/img18/img/evazion/14901720.png # def rewrite_old_small_and_medium_images(thumbnail_url, is_manga) - if thumbnail_url =~ %r!/img/[^/]+/\d+_[ms]\.(?:jpg|jpeg|png|gif)!i + if thumbnail_url =~ %r!/img/#{MONIKER}/\d+_[ms]\.#{EXT}!i if is_manga.nil? illust_id = illust_id_from_url(@url) get_metadata_from_spapi!(illust_id) do |metadata| @@ -141,7 +158,7 @@ module Sources # http://i2.pixiv.net/img04/img/syounen_no_uta/46170939_p0.jpg # http://i1.pixiv.net/c/600x600/img-master/img/2014/09/24/23/25/08/46168376_p0_master1200.jpg # http://i1.pixiv.net/img-original/img/2014/09/25/23/09/29/46183440_p0.jpg - if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.(?:jpg|jpeg|png|gif|zip)!i + if url =~ %r!/\d+_p(\d+)(?:_\w+)?\.#{EXT}!i $1 # http://www.pixiv.net/member_illust.php?mode=manga_big&illust_id=46170939&page=0 @@ -178,6 +195,23 @@ module Sources end end + def get_moniker_from_url + case url + when %r!#{IMG}/img/(#{MONIKER})!i + $1 + when %r!#{I12}/img[0-9]+/img/(#{MONIKER})!i + $1 + when %r!#{WEB}/stacc/(#{MONIKER})/?$!i + $1 + else + false + end + end + + def has_moniker? + get_moniker_from_url != false + end + def get_image_url_from_page(page, is_manga) elements = page.search("div.works_display a img").find_all do |node| node["src"] !~ /source\.pixiv\.net/ @@ -258,10 +292,39 @@ module Sources agent.get(spapi_url) do |response| metadata = CSV.parse(response.content.force_encoding("UTF-8")).first - if metadata.nil? - raise Sources::Error.new("Couldn't get Pixiv API metadata from #{spapi_url}.") - else - yield metadata + validate_spapi_metadata!(metadata) + yield metadata + end + end + + def validate_spapi_metadata!(metadata) + if metadata.nil? + raise Sources::Error.new("Pixiv API returned empty response.") + elsif metadata.size != 31 + raise Sources::Error.new("Pixiv API returned unexpected number of fields.") + end + + illust_id = metadata[0] + file_ext = metadata[2] + page_count = metadata[19] + moniker = metadata[24] + mobile_profile_image = metadata[30] + + if file_ext !~ /#{EXT}/i + raise Sources::Error.new("Pixiv API returned unexpected file extension '#{file_ext}' for pixiv ##{illust_id}.") + elsif moniker !~ /#{MONIKER}/i + raise Sources::Error.new("Pixiv API returned invalid artist moniker '#{moniker}' for pixiv ##{illust_id}.") + elsif page_count.to_s !~ /[0-9]*/i + raise Sources::Error.new("Pixiv API returned invalid page count '#{page_count}' for pixiv ##{illust_id}.") + end + + if mobile_profile_image + # http://i1.pixiv.net/img01/profile/ccz67420/mobile/5042957_80.jpg + profile_regex = %r!i[12]\.pixiv\.net/img\d+/profile/#{MONIKER}/mobile/\d+_\d+\.jpg!i + mobile_moniker = mobile_profile_image.match(profile_regex)[1] + + if mobile_moniker != moniker + raise Sources::Error.new("Pixiv API returned inconsistent artist moniker '#{moniker}' for pixiv ##{illust_id}.") end end end @@ -300,6 +363,73 @@ module Sources raise Sources::Error.new("Couldn't get illust ID from URL: #{url}") end end + + def work_page? + return true if url =~ %r!#{WEB}/member_illust\.php\?mode=(?:medium|big|manga|manga_big)&illust_id=\d+!i + return true if url =~ %r!#{WEB}/i/\d+$!i + return false + end + + def full_image? + # http://img18.pixiv.net/img/evazion/14901720.png?1234 + return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i + + # http://i2.pixiv.net/img18/img/evazion/14901720.png + # http://i1.pixiv.net/img07/img/pasirism/18557054_big_p1.png + return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+(?:_big_p\d+)?\.#{EXT}!i + + # http://i1.pixiv.net/img-original/img/2014/10/02/13/51/23/46304396_p0.png + return true if url =~ %r!#{I12}/img-original/img/#{TIMESTAMP}/\d+_p\d+\.#{EXT}$!i + + # http://i1.pixiv.net/img-zip-ugoira/img/2014/10/03/17/29/16/46323924_ugoira1920x1080.zip + return true if url =~ %r!#{I12}/img-zip-ugoira/img/#{TIMESTAMP}/\d+_ugoira\d+x\d+\.zip$!i + + return false + end + + def sample_image? + # http://img18.pixiv.net/img/evazion/14901720_m.png + return true if url =~ %r!#{IMG}/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i + + # http://i2.pixiv.net/img18/img/evazion/14901720_m.png + # http://i1.pixiv.net/img07/img/pasirism/18557054_p1.png + return true if url =~ %r!#{I12}/img\d+/img/#{MONIKER}/\d+_(?:[sm]|p\d+)\.#{EXT}!i + + # http://i1.pixiv.net/c/600x600/img-master/img/2014/10/02/13/51/23/46304396_p0_master1200.jpg + # http://i2.pixiv.net/c/64x64/img-master/img/2014/10/09/12/59/50/46441917_square1200.jpg + return true if url =~ %r!#{I12}/c/\d+x\d+/img-master/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}$!i + + # http://i1.pixiv.net/img-inf/img/2011/05/01/23/28/04/18557054_s.png + # http://i2.pixiv.net/img-inf/img/2010/11/30/08/54/06/14901765_64x64.jpg + return true if url =~ %r!#{I12}/img-inf/img/#{TIMESTAMP}/\d+_\w+\.#{EXT}!i + + return false + end + + def agent + @agent ||= begin + mech = Mechanize.new + + phpsessid = Cache.get("pixiv-phpsessid") + if phpsessid + cookie = Mechanize::Cookie.new("PHPSESSID", phpsessid) + cookie.domain = ".pixiv.net" + cookie.path = "/" + mech.cookie_jar.add(cookie) + else + mech.get("http://www.pixiv.net") do |page| + page.form_with(:action => "/login.php") do |form| + form['pixiv_id'] = Danbooru.config.pixiv_login + form['pass'] = Danbooru.config.pixiv_password + end.click_button + end + phpsessid = mech.cookie_jar.cookies.select{|c| c.name == "PHPSESSID"}.first + Cache.put("pixiv-phpsessid", phpsessid.value, 1.month) if phpsessid + end + + mech + end + end end end end diff --git a/app/models/artist.rb b/app/models/artist.rb index d28cd0448..154bb075d 100644 --- a/app/models/artist.rb +++ b/app/models/artist.rb @@ -22,10 +22,11 @@ class Artist < ActiveRecord::Base module ClassMethods def find_all_by_url(url) - url = Sources::Site.new(url).normalize_for_artist_finder! url = ArtistUrl.normalize(url) artists = [] + # return [] unless Sources::Site.new(url).normalized_for_artist_finder? + while artists.empty? && url.size > 10 u = url.sub(/\/+$/, "") + "/" u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%' diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index 361007c79..575deb933 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -1,4 +1,5 @@ class ArtistUrl < ActiveRecord::Base + before_save :initialize_normalized_url, on: [ :create ] before_save :normalize validates_presence_of :url belongs_to :artist @@ -12,8 +13,7 @@ class ArtistUrl < ActiveRecord::Base url = url.gsub(/^http:\/\/blog\d+\.fc2/, "http://blog.fc2") url = url.gsub(/^http:\/\/blog-imgs-\d+\.fc2/, "http://blog.fc2") url = url.gsub(/^http:\/\/blog-imgs-\d+-\w+\.fc2/, "http://blog.fc2") - url = url.gsub(/^http:\/\/img\d+\.pixiv\.net/, "http://img.pixiv.net") - url = url.gsub(/^http:\/\/i\d+\.pixiv\.net\/img\d+/, "http://img.pixiv.net") + url = Sources::Site.new(url).normalize_for_artist_finder! url = url.gsub(/\/+\Z/, "") url + "/" end @@ -33,7 +33,13 @@ class ArtistUrl < ActiveRecord::Base end def normalize - self.normalized_url = self.class.normalize(url) + if !Sources::Site.new(normalized_url).normalized_for_artist_finder? + self.normalized_url = self.class.normalize(url) + end + end + + def initialize_normalized_url + self.normalized_url = url end def to_s diff --git a/test/test_helper.rb b/test/test_helper.rb index 60dd2dd6d..0e939a4f9 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -105,4 +105,11 @@ VCR.configure do |c| c.cassette_library_dir = "test/fixtures/vcr_cassettes" c.hook_into :webmock # c.allow_http_connections_when_no_cassette = true + + c.default_cassette_options = { + match_requests_on: [ + :method, + VCR.request_matchers.uri_without_param(:PHPSESSID) + ] + } end