artists: change how artist urls are normalized.

Change how artist URLs are normalized in artist entries. Don't try to secretly
convert image URLs to profile URLs in artist entries. For example, if someone puts a
Pixiv image URL in an artist entry, don't secretly try to fetch the source and
convert it into a profile URL in the `normalized_url` field.

We used to do this conversion because, years ago, it was standard practice to put image URLs in artist
entries. Pixiv image URLs used to contain the artist's username, so we used to put
image URLs in artist entries for artist finding purposes. But Pixiv changed it so
that image URLs no longer contained the username, so we dealt with it by adding a
`normalized_url` column to artist_urls and secretly converting image URLs to profile
URLs in this field. But this is no longer necessary because now we don't normally put
image URLs in artist entries in the first place.

Now the `profile_url` method in `Source::URL` is used to normalize URLs in artist
entries. This lets us parse various profile URL formats and normalize them into a
single canonical form.

This also removes the `normalize_for_artist_finder` method from source strategies.
Instead the `profile_url` method is used for artist finding purposes. So the profile
URL returned by the source strategy needs to be the same as the URL in the artist
entry in order for artist finding to work.
This commit is contained in:
evazion
2022-03-12 19:54:47 -06:00
parent 9343f7c912
commit cf8b8207e2
9 changed files with 55 additions and 135 deletions

View File

@@ -97,7 +97,8 @@ module Sources
[artist_name, tag_name].compact.uniq
end
# A link to the artist's profile page on the site.
# A link to the artist's profile page on the site. This will be used for
# artist finding purposes, so it needs to match the URL in the artist entry.
def profile_url
nil
end
@@ -137,12 +138,6 @@ module Sources
end
memoize :http_downloader
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
profile_url.presence || url
end
# Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead.
# This function should never make any network call, even indirectly. Return nil to never normalize.
def normalize_for_source
@@ -150,7 +145,7 @@ module Sources
end
def artists
ArtistFinder.find_artists(normalize_for_artist_finder.to_s)
ArtistFinder.find_artists(profile_url.to_s)
end
# A new artist entry with suggested defaults for when the artist doesn't
@@ -234,7 +229,6 @@ module Sources
:image_urls => image_urls,
:page_url => page_url,
:canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags,
:normalized_tags => normalized_tags,
:translated_tags => translated_tags,

View File

@@ -40,10 +40,7 @@ module Sources
end
def profile_url
user_id = api_client&.user_id
return if user_id.blank? # artists can be anonymous
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}"
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" if api_client.user_id.present?
end
def artist_name

View File

@@ -62,12 +62,10 @@ module Sources
end
def profile_url
if parsed_url.profile_url.present?
parsed_url.profile_url
elsif api_illust[:userId].present?
if api_illust[:userId].present?
"https://www.pixiv.net/users/#{api_illust[:userId]}"
else
nil
elsif parsed_url.profile_url.present?
parsed_url.profile_url
end
end

View File

@@ -93,10 +93,6 @@ module Sources::Strategies
api_response[:full_text].to_s
end
def normalize_for_artist_finder
profile_url.try(:downcase).presence || url
end
def normalize_for_source
if tag_name_from_url.present? && status_id.present?
"https://twitter.com/#{tag_name_from_url}/status/#{status_id}"

View File

@@ -242,7 +242,8 @@ class Artist < ApplicationRecord
elsif query.include?("*")
where(id: ArtistURL.where_like(:url, query).select(:artist_id))
elsif query =~ %r{\Ahttps?://}i
ArtistFinder.find_artists(query)
url = Sources::Strategies.find(query).profile_url || query
ArtistFinder.find_artists(url)
else
where(id: ArtistURL.where_like(:url, "*#{query}*").select(:artist_id))
end

View File

@@ -19,27 +19,16 @@ class ArtistURL < ApplicationRecord
end
def self.normalize_normalized_url(url)
if url.nil?
nil
else
url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/")
return nil if url.nil?
# the strategy won't always work for twitter because it looks for a status
url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i
url = Source::URL.parse(url)&.profile_url || url
url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = Sources::Strategies.find(url).normalize_for_artist_finder
# XXX the Pixiv strategy should implement normalize_for_artist_finder and return the correct url directly.
url = url.sub(%r{\Ahttps?://www\.pixiv\.net/(?:en/)?users/(\d+)\z}i, 'https://www.pixiv.net/member.php?id=\1')
url = url.gsub(%r{/+\Z}, "")
url = url.gsub(%r{^https://}, "http://")
url + "/"
end
url = url.gsub(%r{/+\Z}, "")
url + "/"
end
def self.search(params = {})
@@ -67,7 +56,8 @@ class ArtistURL < ApplicationRecord
elsif url.include?("*")
where_ilike(attr, url)
else
where(attr => normalize_normalized_url(url))
profile_url = Sources::Strategies.find(url).profile_url || url
where(attr => normalize_normalized_url(profile_url))
end
end