Fix #5002: "Urls url has already been taken" when submitting duplicate urls with different capitalization

Fix URLs being normalized after checking for duplicates rather than
before, which meant that URLs that differed in capitalization weren't
detected as duplicates.
This commit is contained in:
evazion
2022-02-08 17:38:47 -06:00
parent 67d834775d
commit 21c0d55aa4
3 changed files with 19 additions and 15 deletions

View File

@@ -149,7 +149,7 @@ module ArtistFinder
# @param url [String] the artist profile URL
# @return [Array<Artist>] the list of matching artists
def find_artists(url)
url = ArtistUrl.normalize(url)
url = ArtistUrl.normalize_normalized_url(url)
artists = []
while artists.empty? && url.size > 10

View File

@@ -1,8 +1,8 @@
# frozen_string_literal: true
class ArtistUrl < ApplicationRecord
before_validation :initialize_normalized_url, on: :create
before_validation :normalize
normalize :url, :normalize_url
validates :url, presence: true, uniqueness: { scope: :artist_id }
validate :validate_url_format
belongs_to :artist, :touch => true
@@ -18,7 +18,7 @@ class ArtistUrl < ApplicationRecord
[is_active, url]
end
def self.normalize(url)
def self.normalize_normalized_url(url)
if url.nil?
nil
else
@@ -29,7 +29,7 @@ class ArtistUrl < ApplicationRecord
url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/")
# the strategy won't always work for twitter because it looks for a status
url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}
url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i
url = Sources::Strategies.find(url).normalize_for_artist_finder
@@ -67,7 +67,7 @@ class ArtistUrl < ApplicationRecord
elsif url.include?("*")
where_ilike(attr, url)
else
where(attr => normalize(url))
where(attr => normalize_normalized_url(url))
end
end
@@ -118,20 +118,17 @@ class ArtistUrl < ApplicationRecord
sites.index(site_name) || 1000
end
def normalize
# Perform some normalization with Addressable on the URL itself
# - Converts scheme and hostname to downcase
# - Converts unicode hostname to Punycode
def self.normalize_url(url)
uri = Addressable::URI.parse(url)
uri.site = uri.normalized_site
self.url = uri.to_s
self.normalized_url = self.class.normalize(url)
uri.to_s
rescue Addressable::URI::InvalidURIError
# Don't bother normalizing the URL if there is errors
url
end
def initialize_normalized_url
self.normalized_url = url
def url=(url)
super(url)
self.normalized_url = self.class.normalize_normalized_url(self.url)
end
def to_s

View File

@@ -158,6 +158,13 @@ class ArtistTest < ActiveSupport::TestCase
assert_equal(old_url_ids, ArtistUrl.order("id").pluck(&:id))
end
should "normalize urls before removing duplicates" do
@artist = create(:artist, url_string: "https://Twitter.com/o8q https://twitter.com/o8q")
assert_equal(1, @artist.urls.count)
assert_equal(["https://twitter.com/o8q"], @artist.urls.map(&:to_s))
end
should "ignore pixiv.net/ and pixiv.net/img/ url matches" do
a1 = FactoryBot.create(:artist, :name => "yomosaka", :url_string => "http://i2.pixiv.net/img18/img/evazion/14901720.png")
a2 = FactoryBot.create(:artist, :name => "niwatazumi_bf", :url_string => "http://i2.pixiv.net/img18/img/evazion/14901720_big_p0.png")