artists: change how artist urls are normalized.

Change how artist URLs are normalized in artist entries. Don't try to secretly
convert image URLs to profile URLs in artist entries. For example, if someone puts a
Pixiv image URL in an artist entry, don't secretly try to fetch the source and
convert it into a profile URL in the `normalized_url` field.

We used to do this conversion because, years ago, it was standard practice to put
image URLs in artist entries. Pixiv image URLs used to contain the artist's username,
so we put image URLs in artist entries for artist finding purposes. Pixiv later
changed its image URLs so that they no longer contained the username, and we dealt
with that by adding a `normalized_url` column to artist_urls and secretly converting
image URLs to profile URLs in this field. This is no longer necessary because we
don't normally put image URLs in artist entries in the first place anymore.

Now the `profile_url` method in `Source::URL` is used to normalize URLs in artist
entries. This lets us parse various profile URL formats and normalize them into a
single canonical form.

This also removes the `normalize_for_artist_finder` method from source strategies.
Instead the `profile_url` method is used for artist finding purposes. So the profile
URL returned by the source strategy needs to be the same as the URL in the artist
entry in order for artist finding to work.
This commit is contained in:
evazion
2022-03-12 19:54:47 -06:00
parent 9343f7c912
commit cf8b8207e2
9 changed files with 55 additions and 135 deletions

View File

@@ -97,7 +97,8 @@ module Sources
[artist_name, tag_name].compact.uniq [artist_name, tag_name].compact.uniq
end end
# A link to the artist's profile page on the site. # A link to the artist's profile page on the site. This will be used for
# artist finding purposes, so it needs to match the URL in the artist entry.
def profile_url def profile_url
nil nil
end end
@@ -137,12 +138,6 @@ module Sources
end end
memoize :http_downloader memoize :http_downloader
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
profile_url.presence || url
end
# Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead. # Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead.
# This function should never make any network call, even indirectly. Return nil to never normalize. # This function should never make any network call, even indirectly. Return nil to never normalize.
def normalize_for_source def normalize_for_source
@@ -150,7 +145,7 @@ module Sources
end end
def artists def artists
ArtistFinder.find_artists(normalize_for_artist_finder.to_s) ArtistFinder.find_artists(profile_url.to_s)
end end
# A new artist entry with suggested defaults for when the artist doesn't # A new artist entry with suggested defaults for when the artist doesn't
@@ -234,7 +229,6 @@ module Sources
:image_urls => image_urls, :image_urls => image_urls,
:page_url => page_url, :page_url => page_url,
:canonical_url => canonical_url, :canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags, :tags => tags,
:normalized_tags => normalized_tags, :normalized_tags => normalized_tags,
:translated_tags => translated_tags, :translated_tags => translated_tags,

View File

@@ -40,10 +40,7 @@ module Sources
end end
def profile_url def profile_url
user_id = api_client&.user_id "https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" if api_client.user_id.present?
return if user_id.blank? # artists can be anonymous
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}"
end end
def artist_name def artist_name

View File

@@ -62,12 +62,10 @@ module Sources
end end
def profile_url def profile_url
if parsed_url.profile_url.present? if api_illust[:userId].present?
parsed_url.profile_url
elsif api_illust[:userId].present?
"https://www.pixiv.net/users/#{api_illust[:userId]}" "https://www.pixiv.net/users/#{api_illust[:userId]}"
else elsif parsed_url.profile_url.present?
nil parsed_url.profile_url
end end
end end

View File

@@ -93,10 +93,6 @@ module Sources::Strategies
api_response[:full_text].to_s api_response[:full_text].to_s
end end
def normalize_for_artist_finder
profile_url.try(:downcase).presence || url
end
def normalize_for_source def normalize_for_source
if tag_name_from_url.present? && status_id.present? if tag_name_from_url.present? && status_id.present?
"https://twitter.com/#{tag_name_from_url}/status/#{status_id}" "https://twitter.com/#{tag_name_from_url}/status/#{status_id}"

View File

@@ -242,7 +242,8 @@ class Artist < ApplicationRecord
elsif query.include?("*") elsif query.include?("*")
where(id: ArtistURL.where_like(:url, query).select(:artist_id)) where(id: ArtistURL.where_like(:url, query).select(:artist_id))
elsif query =~ %r{\Ahttps?://}i elsif query =~ %r{\Ahttps?://}i
ArtistFinder.find_artists(query) url = Sources::Strategies.find(query).profile_url || query
ArtistFinder.find_artists(url)
else else
where(id: ArtistURL.where_like(:url, "*#{query}*").select(:artist_id)) where(id: ArtistURL.where_like(:url, "*#{query}*").select(:artist_id))
end end

View File

@@ -19,27 +19,16 @@ class ArtistURL < ApplicationRecord
end end
def self.normalize_normalized_url(url) def self.normalize_normalized_url(url)
if url.nil? return nil if url.nil?
nil
else
url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/")
# the strategy won't always work for twitter because it looks for a status url = Source::URL.parse(url)&.profile_url || url
url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = Sources::Strategies.find(url).normalize_for_artist_finder url = url.gsub(%r{/+\Z}, "")
url + "/"
# XXX the Pixiv strategy should implement normalize_for_artist_finder and return the correct url directly.
url = url.sub(%r{\Ahttps?://www\.pixiv\.net/(?:en/)?users/(\d+)\z}i, 'https://www.pixiv.net/member.php?id=\1')
url = url.gsub(%r{/+\Z}, "")
url = url.gsub(%r{^https://}, "http://")
url + "/"
end
end end
def self.search(params = {}) def self.search(params = {})
@@ -67,7 +56,8 @@ class ArtistURL < ApplicationRecord
elsif url.include?("*") elsif url.include?("*")
where_ilike(attr, url) where_ilike(attr, url)
else else
where(attr => normalize_normalized_url(url)) profile_url = Sources::Strategies.find(url).profile_url || url
where(attr => normalize_normalized_url(profile_url))
end end
end end

View File

@@ -2,14 +2,14 @@ require 'test_helper'
class ArtistTest < ActiveSupport::TestCase class ArtistTest < ActiveSupport::TestCase
def assert_artist_found(expected_name, source_url) def assert_artist_found(expected_name, source_url)
artists = ArtistFinder.find_artists(source_url).to_a artists = Artist.search(url_matches: source_url).to_a
assert_equal(1, artists.size) assert_equal(1, artists.size)
assert_equal(expected_name, artists.first.name, "Testing URL: #{source_url}") assert_equal(expected_name, artists.first.name, "Testing URL: #{source_url}")
end end
def assert_artist_not_found(source_url) def assert_artist_not_found(source_url)
artists = ArtistFinder.find_artists(source_url).to_a artists = Artist.search(url_matches: source_url).to_a
assert_equal(0, artists.size, "Testing URL: #{source_url}") assert_equal(0, artists.size, "Testing URL: #{source_url}")
end end

View File

@@ -52,149 +52,97 @@ class ArtistURLTest < ActiveSupport::TestCase
end end
should "normalise https" do should "normalise https" do
url = FactoryBot.create(:artist_url, :url => "https://google.com") url = create(:artist_url, url: "https://google.com")
assert_equal("https://google.com", url.url) assert_equal("https://google.com", url.url)
assert_equal("http://google.com/", url.normalized_url) assert_equal("http://google.com/", url.normalized_url)
end end
should "normalise domains to lowercase" do should "normalise domains to lowercase" do
url = FactoryBot.create(:artist_url, url: "https://ArtistName.example.com") url = create(:artist_url, url: "https://ArtistName.example.com")
assert_equal("http://artistname.example.com/", url.normalized_url) assert_equal("http://artistname.example.com/", url.normalized_url)
end end
context "normalize twitter profile urls" do should "normalize ArtStation urls" do
setup do url = create(:artist_url, url: "https://www.artstation.com/koyorin")
@url = FactoryBot.create(:artist_url, :url => "https://twitter.com/BLAH") assert_equal("http://www.artstation.com/koyorin/", url.normalized_url)
end
should "downcase the url" do url = create(:artist_url, url: "https://koyorin.artstation.com"),
assert_equal("http://twitter.com/blah/", @url.normalized_url) assert_equal("http://www.artstation.com/koyorin/", url.normalized_url)
end
end
context "artstation urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "https://www.artstation.com/koyorin"),
FactoryBot.create(:artist_url, url: "https://koyorin.artstation.com"),
FactoryBot.create(:artist_url, url: "https://www.artstation.com/artwork/04XA4")
]
end
should "normalize" do
assert_equal("http://www.artstation.com/koyorin/", @urls[0].normalized_url)
assert_equal("http://www.artstation.com/koyorin/", @urls[1].normalized_url)
assert_equal("http://www.artstation.com/jeyrain/", @urls[2].normalized_url)
end
end
context "deviantart urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484"),
FactoryBot.create(:artist_url, url: "http://noizave.deviantart.com/art/test-post-please-ignore-685436408"),
FactoryBot.create(:artist_url, url: "https://www.deviantart.com/noizave")
]
end
should "normalize" do
assert_equal("http://www.deviantart.com/aeror404/", @urls[0].normalized_url)
assert_equal("http://www.deviantart.com/noizave/", @urls[1].normalized_url)
assert_equal("http://www.deviantart.com/noizave/", @urls[2].normalized_url)
end
end
context "nicoseiga urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777"),
FactoryBot.create(:artist_url, url: "http://lohas.nicoseiga.jp/o/910aecf08e542285862954017f8a33a8c32a8aec/1433298801/4937663"),
FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/seiga/im4937663")
]
end
should "normalize" do
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[0].normalized_url)
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[1].normalized_url)
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[2].normalized_url)
end
end end
should "normalize fc2 urls" do should "normalize fc2 urls" do
url = FactoryBot.create(:artist_url, :url => "http://blog55.fc2.com/monet") url = create(:artist_url, url: "http://blog55.fc2.com/monet")
assert_equal("http://blog55.fc2.com/monet", url.url) assert_equal("http://blog55.fc2.com/monet", url.url)
assert_equal("http://monet.blog.fc2.com/", url.normalized_url) assert_equal("http://monet.blog.fc2.com/", url.normalized_url)
url = FactoryBot.create(:artist_url, :url => "http://blog-imgs-55.fc2.com/monet") url = create(:artist_url, url: "http://blog-imgs-55.fc2.com/monet")
assert_equal("http://blog-imgs-55.fc2.com/monet", url.url) assert_equal("http://blog-imgs-55.fc2.com/monet", url.url)
assert_equal("http://monet.blog.fc2.com/", url.normalized_url) assert_equal("http://monet.blog.fc2.com/", url.normalized_url)
end end
should "normalize deviant art artist urls" do should "normalize deviant art artist urls" do
url = FactoryBot.create(:artist_url, :url => "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484") url = create(:artist_url, url: "https://noizave.deviantart.com")
assert_equal("http://www.deviantart.com/aeror404/", url.normalized_url) assert_equal("http://www.deviantart.com/noizave/", url.normalized_url)
end end
should "normalize nico seiga artist urls" do should "normalize nico seiga artist urls" do
url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/user/illust/7017777") url = create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777")
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url) assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url)
url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/seiga/im4937663") url = create(:artist_url, url: "http://seiga.nicovideo.jp/manga/list?user_id=23839737")
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url) assert_equal("http://seiga.nicovideo.jp/user/illust/23839737/", url.normalized_url)
url = create(:artist_url, url: "https://www.nicovideo.jp/user/20446930/mylist/28674289")
assert_equal("http://seiga.nicovideo.jp/user/illust/20446930/", url.normalized_url)
end end
should "normalize hentai foundry artist urls" do should "normalize hentai foundry artist urls" do
url = FactoryBot.create(:artist_url, :url => "http://pictures.hentai-foundry.com//a/AnimeFlux/219123.jpg") url = create(:artist_url, url: "https://www.hentai-foundry.com/user/kajinman/profile")
assert_equal("http://www.hentai-foundry.com/user/AnimeFlux/", url.normalized_url) assert_equal("http://www.hentai-foundry.com/user/kajinman/", url.normalized_url)
end
should "normalize pixiv urls" do
url = FactoryBot.create(:artist_url, :url => "https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png")
assert_equal("https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png", url.url)
assert_equal("http://www.pixiv.net/member.php?id=339253/", url.normalized_url)
end end
should "normalize pixiv stacc urls" do should "normalize pixiv stacc urls" do
url = FactoryBot.create(:artist_url, :url => "https://www.pixiv.net/stacc/evazion") url = create(:artist_url, url: "https://www.pixiv.net/stacc/evazion")
assert_equal("https://www.pixiv.net/stacc/evazion", url.url)
assert_equal("http://www.pixiv.net/stacc/evazion/", url.normalized_url) assert_equal("http://www.pixiv.net/stacc/evazion/", url.normalized_url)
end end
should "normalize pixiv fanbox account urls" do should "normalize pixiv fanbox account urls" do
url = FactoryBot.create(:artist_url, :url => "http://www.pixiv.net/fanbox/creator/3113804") url = create(:artist_url, url: "https://www.pixiv.net/fanbox/creator/3113804")
assert_equal("http://www.pixiv.net/fanbox/creator/3113804", url.url) assert_equal("http://www.pixiv.net/fanbox/creator/3113804/", url.normalized_url)
assert_equal("http://drw24olf.fanbox.cc/", url.normalized_url)
url = create(:artist_url, url: "https://omu001.fanbox.cc/posts/39714")
assert_equal("http://omu001.fanbox.cc/", url.normalized_url)
end end
should "normalize pixiv.net/user/123 urls" do should "normalize pixiv.net/user/123 urls" do
url = create(:artist_url, url: "https://www.pixiv.net/en/users/123") url = create(:artist_url, url: "https://www.pixiv.net/en/users/123")
assert_equal("https://www.pixiv.net/en/users/123", url.url)
assert_equal("http://www.pixiv.net/member.php?id=123/", url.normalized_url) assert_equal("http://www.pixiv.net/member.php?id=123/", url.normalized_url)
end end
should "normalize twitter urls" do should "normalize twitter urls" do
url = FactoryBot.create(:artist_url, :url => "https://twitter.com/aoimanabu/status/892370963630743552") url = create(:artist_url, url: "https://twitter.com/aoimanabu/status/892370963630743552")
assert_equal("https://twitter.com/aoimanabu/status/892370963630743552", url.url)
assert_equal("http://twitter.com/aoimanabu/", url.normalized_url) assert_equal("http://twitter.com/aoimanabu/", url.normalized_url)
url = create(:artist_url, url: "https://twitter.com/BLAH")
assert_equal("http://twitter.com/BLAH/", url.normalized_url)
end end
should "normalize https://twitter.com/intent/user?user_id=* urls" do should "normalize https://twitter.com/intent/user?user_id=* urls" do
url = FactoryBot.create(:artist_url, :url => "https://twitter.com/intent/user?user_id=2784590030") url = create(:artist_url, url: "https://twitter.com/intent/user?user_id=2784590030")
assert_equal("https://twitter.com/intent/user?user_id=2784590030", url.url)
assert_equal("http://twitter.com/intent/user?user_id=2784590030/", url.normalized_url) assert_equal("http://twitter.com/intent/user?user_id=2784590030/", url.normalized_url)
end end
should "normalize nijie urls" do should "normalize nijie urls" do
url = FactoryBot.create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png") url = create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png")
assert_equal("http://nijie.info/members.php?id=236014/", url.normalized_url) assert_equal("http://nijie.info/members.php?id=236014/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://nijie.info/members.php?id=161703") url = create(:artist_url, url: "https://nijie.info/members.php?id=161703")
assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url) assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703") url = create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703")
assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url) assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://nijie.info/invalid.php") url = create(:artist_url, url: "https://nijie.info/invalid.php")
assert_equal("http://nijie.info/invalid.php/", url.normalized_url) assert_equal("http://nijie.info/invalid.php/", url.normalized_url)
end end

View File

@@ -67,10 +67,6 @@ module Sources
should "get the artist name" do should "get the artist name" do
assert_equal("Afrobull", @site.artist_name) assert_equal("Afrobull", @site.artist_name)
end end
should "get the normalized url" do
assert_equal("https://www.hentai-foundry.com/user/Afrobull", @site.normalize_for_artist_finder)
end
end end
context "A deleted picture" do context "A deleted picture" do