diff --git a/app/logical/artist_finder.rb b/app/logical/artist_finder.rb index 52d3d6e48..916bec996 100644 --- a/app/logical/artist_finder.rb +++ b/app/logical/artist_finder.rb @@ -149,7 +149,7 @@ module ArtistFinder # @param url [String] the artist profile URL # @return [Array] the list of matching artists def find_artists(url) - url = ArtistUrl.normalize(url) + url = ArtistUrl.normalize_normalized_url(url) artists = [] while artists.empty? && url.size > 10 diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index 21815f2de..3112f4328 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -1,8 +1,8 @@ # frozen_string_literal: true class ArtistUrl < ApplicationRecord - before_validation :initialize_normalized_url, on: :create - before_validation :normalize + normalize :url, :normalize_url + validates :url, presence: true, uniqueness: { scope: :artist_id } validate :validate_url_format belongs_to :artist, :touch => true @@ -18,7 +18,7 @@ class ArtistUrl < ApplicationRecord [is_active, url] end - def self.normalize(url) + def self.normalize_normalized_url(url) if url.nil? nil else @@ -29,7 +29,7 @@ class ArtistUrl < ApplicationRecord url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/") # the strategy won't always work for twitter because it looks for a status - url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com} + url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i url = Sources::Strategies.find(url).normalize_for_artist_finder @@ -67,7 +67,7 @@ class ArtistUrl < ApplicationRecord elsif url.include?("*") where_ilike(attr, url) else - where(attr => normalize(url)) + where(attr => normalize_normalized_url(url)) end end @@ -118,20 +118,17 @@ class ArtistUrl < ApplicationRecord sites.index(site_name) || 1000 end - def normalize - # Perform some normalization with Addressable on the URL itself - # - Converts scheme and hostname to downcase - # - Converts unicode hostname to Punycode + def self.normalize_url(url) uri = Addressable::URI.parse(url) uri.site = uri.normalized_site - self.url = uri.to_s - self.normalized_url = self.class.normalize(url) + uri.to_s rescue Addressable::URI::InvalidURIError - # Don't bother normalizing the URL if there is errors + url end - def initialize_normalized_url - self.normalized_url = url + def url=(url) + super(url) + self.normalized_url = self.class.normalize_normalized_url(self.url) end def to_s diff --git a/test/unit/artist_test.rb b/test/unit/artist_test.rb index 91a528f05..0cb9d40e4 100644 --- a/test/unit/artist_test.rb +++ b/test/unit/artist_test.rb @@ -158,6 +158,13 @@ class ArtistTest < ActiveSupport::TestCase assert_equal(old_url_ids, ArtistUrl.order("id").pluck(&:id)) end + should "normalize urls before removing duplicates" do + @artist = create(:artist, url_string: "https://Twitter.com/o8q https://twitter.com/o8q") + + assert_equal(1, @artist.urls.count) + assert_equal(["https://twitter.com/o8q"], @artist.urls.map(&:to_s)) + end + should "ignore pixiv.net/ and pixiv.net/img/ url matches" do a1 = FactoryBot.create(:artist, :name => "yomosaka", :url_string => "http://i2.pixiv.net/img18/img/evazion/14901720.png") a2 = FactoryBot.create(:artist, :name => "niwatazumi_bf", :url_string => "http://i2.pixiv.net/img18/img/evazion/14901720_big_p0.png")