artists: change how artist urls are normalized.

Change how artist URLs are normalized in artist entries. Don't try to secretly
convert image URLs to profile URLs in artist entries. For example, if someone puts a
Pixiv image URL in an artist entry, don't secretly try to fetch the source and
convert it into a profile URL in the `normalized_url` field.

We did this because years ago, it was standard practice to put image URLs in artist
entries. Pixiv image URLs used to contain the artist's username, so we used to put
image URLs in artist entries for artist finding purposes. But Pixiv changed it so
that image URLs no longer contained the username, so we dealt with it by adding a
`normalized_url` column to artist_urls and secretly converting image URLs to profile
URLs in this field. But this is no longer necessary because now we don't normally put
image URLs in artist entries in the first place.

Now the `profile_url` method in `Source::URL` is used to normalize URLs in artist
entries. This lets us parse various profile URL formats and normalize them into a
single canonical form.

This also removes the `normalize_for_artist_finder` method from source strategies.
Instead the `profile_url` method is used for artist finding purposes. So the profile
URL returned by the source strategy needs to be the same as the URL in the artist
entry in order for artist finding to work.
This commit is contained in:
evazion
2022-03-12 19:54:47 -06:00
parent 9343f7c912
commit cf8b8207e2
9 changed files with 55 additions and 135 deletions

View File

@@ -97,7 +97,8 @@ module Sources
[artist_name, tag_name].compact.uniq
end
# A link to the artist's profile page on the site.
# A link to the artist's profile page on the site. This will be used for
# artist finding purposes, so it needs to match the URL in the artist entry.
def profile_url
nil
end
@@ -137,12 +138,6 @@ module Sources
end
memoize :http_downloader
# The url to use for artist finding purposes. This will be stored in the
# artist entry. Normally this will be the profile url.
def normalize_for_artist_finder
profile_url.presence || url
end
# Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead.
# This function should never make any network call, even indirectly. Return nil to never normalize.
def normalize_for_source
@@ -150,7 +145,7 @@ module Sources
end
def artists
ArtistFinder.find_artists(normalize_for_artist_finder.to_s)
ArtistFinder.find_artists(profile_url.to_s)
end
# A new artist entry with suggested defaults for when the artist doesn't
@@ -234,7 +229,6 @@ module Sources
:image_urls => image_urls,
:page_url => page_url,
:canonical_url => canonical_url,
:normalized_for_artist_finder_url => normalize_for_artist_finder,
:tags => tags,
:normalized_tags => normalized_tags,
:translated_tags => translated_tags,

View File

@@ -40,10 +40,7 @@ module Sources
end
def profile_url
user_id = api_client&.user_id
return if user_id.blank? # artists can be anonymous
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}"
"https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" if api_client.user_id.present?
end
def artist_name

View File

@@ -62,12 +62,10 @@ module Sources
end
def profile_url
if parsed_url.profile_url.present?
parsed_url.profile_url
elsif api_illust[:userId].present?
if api_illust[:userId].present?
"https://www.pixiv.net/users/#{api_illust[:userId]}"
else
nil
elsif parsed_url.profile_url.present?
parsed_url.profile_url
end
end

View File

@@ -93,10 +93,6 @@ module Sources::Strategies
api_response[:full_text].to_s
end
def normalize_for_artist_finder
profile_url.try(:downcase).presence || url
end
def normalize_for_source
if tag_name_from_url.present? && status_id.present?
"https://twitter.com/#{tag_name_from_url}/status/#{status_id}"

View File

@@ -242,7 +242,8 @@ class Artist < ApplicationRecord
elsif query.include?("*")
where(id: ArtistURL.where_like(:url, query).select(:artist_id))
elsif query =~ %r{\Ahttps?://}i
ArtistFinder.find_artists(query)
url = Sources::Strategies.find(query).profile_url || query
ArtistFinder.find_artists(url)
else
where(id: ArtistURL.where_like(:url, "*#{query}*").select(:artist_id))
end

View File

@@ -19,27 +19,16 @@ class ArtistURL < ApplicationRecord
end
def self.normalize_normalized_url(url)
if url.nil?
nil
else
url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/")
return nil if url.nil?
# the strategy won't always work for twitter because it looks for a status
url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i
url = Source::URL.parse(url)&.profile_url || url
url = url.sub(%r{^https://}, "http://")
url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2")
url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com")
url = Sources::Strategies.find(url).normalize_for_artist_finder
# XXX the Pixiv strategy should implement normalize_for_artist_finder and return the correct url directly.
url = url.sub(%r{\Ahttps?://www\.pixiv\.net/(?:en/)?users/(\d+)\z}i, 'https://www.pixiv.net/member.php?id=\1')
url = url.gsub(%r{/+\Z}, "")
url = url.gsub(%r{^https://}, "http://")
url + "/"
end
url = url.gsub(%r{/+\Z}, "")
url + "/"
end
def self.search(params = {})
@@ -67,7 +56,8 @@ class ArtistURL < ApplicationRecord
elsif url.include?("*")
where_ilike(attr, url)
else
where(attr => normalize_normalized_url(url))
profile_url = Sources::Strategies.find(url).profile_url || url
where(attr => normalize_normalized_url(profile_url))
end
end

View File

@@ -2,14 +2,14 @@ require 'test_helper'
class ArtistTest < ActiveSupport::TestCase
def assert_artist_found(expected_name, source_url)
artists = ArtistFinder.find_artists(source_url).to_a
artists = Artist.search(url_matches: source_url).to_a
assert_equal(1, artists.size)
assert_equal(expected_name, artists.first.name, "Testing URL: #{source_url}")
end
def assert_artist_not_found(source_url)
artists = ArtistFinder.find_artists(source_url).to_a
artists = Artist.search(url_matches: source_url).to_a
assert_equal(0, artists.size, "Testing URL: #{source_url}")
end

View File

@@ -52,149 +52,97 @@ class ArtistURLTest < ActiveSupport::TestCase
end
should "normalise https" do
url = FactoryBot.create(:artist_url, :url => "https://google.com")
url = create(:artist_url, url: "https://google.com")
assert_equal("https://google.com", url.url)
assert_equal("http://google.com/", url.normalized_url)
end
should "normalise domains to lowercase" do
url = FactoryBot.create(:artist_url, url: "https://ArtistName.example.com")
url = create(:artist_url, url: "https://ArtistName.example.com")
assert_equal("http://artistname.example.com/", url.normalized_url)
end
context "normalize twitter profile urls" do
setup do
@url = FactoryBot.create(:artist_url, :url => "https://twitter.com/BLAH")
end
should "normalize ArtStation urls" do
url = create(:artist_url, url: "https://www.artstation.com/koyorin")
assert_equal("http://www.artstation.com/koyorin/", url.normalized_url)
should "downcase the url" do
assert_equal("http://twitter.com/blah/", @url.normalized_url)
end
end
context "artstation urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "https://www.artstation.com/koyorin"),
FactoryBot.create(:artist_url, url: "https://koyorin.artstation.com"),
FactoryBot.create(:artist_url, url: "https://www.artstation.com/artwork/04XA4")
]
end
should "normalize" do
assert_equal("http://www.artstation.com/koyorin/", @urls[0].normalized_url)
assert_equal("http://www.artstation.com/koyorin/", @urls[1].normalized_url)
assert_equal("http://www.artstation.com/jeyrain/", @urls[2].normalized_url)
end
end
context "deviantart urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484"),
FactoryBot.create(:artist_url, url: "http://noizave.deviantart.com/art/test-post-please-ignore-685436408"),
FactoryBot.create(:artist_url, url: "https://www.deviantart.com/noizave")
]
end
should "normalize" do
assert_equal("http://www.deviantart.com/aeror404/", @urls[0].normalized_url)
assert_equal("http://www.deviantart.com/noizave/", @urls[1].normalized_url)
assert_equal("http://www.deviantart.com/noizave/", @urls[2].normalized_url)
end
end
context "nicoseiga urls" do
setup do
@urls = [
FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777"),
FactoryBot.create(:artist_url, url: "http://lohas.nicoseiga.jp/o/910aecf08e542285862954017f8a33a8c32a8aec/1433298801/4937663"),
FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/seiga/im4937663")
]
end
should "normalize" do
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[0].normalized_url)
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[1].normalized_url)
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[2].normalized_url)
end
url = create(:artist_url, url: "https://koyorin.artstation.com"),
assert_equal("http://www.artstation.com/koyorin/", url.normalized_url)
end
should "normalize fc2 urls" do
url = FactoryBot.create(:artist_url, :url => "http://blog55.fc2.com/monet")
url = create(:artist_url, url: "http://blog55.fc2.com/monet")
assert_equal("http://blog55.fc2.com/monet", url.url)
assert_equal("http://monet.blog.fc2.com/", url.normalized_url)
url = FactoryBot.create(:artist_url, :url => "http://blog-imgs-55.fc2.com/monet")
url = create(:artist_url, url: "http://blog-imgs-55.fc2.com/monet")
assert_equal("http://blog-imgs-55.fc2.com/monet", url.url)
assert_equal("http://monet.blog.fc2.com/", url.normalized_url)
end
should "normalize deviant art artist urls" do
url = FactoryBot.create(:artist_url, :url => "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484")
assert_equal("http://www.deviantart.com/aeror404/", url.normalized_url)
url = create(:artist_url, url: "https://noizave.deviantart.com")
assert_equal("http://www.deviantart.com/noizave/", url.normalized_url)
end
should "normalize nico seiga artist urls" do
url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/user/illust/7017777")
url = create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777")
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url)
url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/seiga/im4937663")
assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url)
url = create(:artist_url, url: "http://seiga.nicovideo.jp/manga/list?user_id=23839737")
assert_equal("http://seiga.nicovideo.jp/user/illust/23839737/", url.normalized_url)
url = create(:artist_url, url: "https://www.nicovideo.jp/user/20446930/mylist/28674289")
assert_equal("http://seiga.nicovideo.jp/user/illust/20446930/", url.normalized_url)
end
should "normalize hentai foundry artist urls" do
url = FactoryBot.create(:artist_url, :url => "http://pictures.hentai-foundry.com//a/AnimeFlux/219123.jpg")
assert_equal("http://www.hentai-foundry.com/user/AnimeFlux/", url.normalized_url)
end
should "normalize pixiv urls" do
url = FactoryBot.create(:artist_url, :url => "https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png")
assert_equal("https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png", url.url)
assert_equal("http://www.pixiv.net/member.php?id=339253/", url.normalized_url)
url = create(:artist_url, url: "https://www.hentai-foundry.com/user/kajinman/profile")
assert_equal("http://www.hentai-foundry.com/user/kajinman/", url.normalized_url)
end
should "normalize pixiv stacc urls" do
url = FactoryBot.create(:artist_url, :url => "https://www.pixiv.net/stacc/evazion")
assert_equal("https://www.pixiv.net/stacc/evazion", url.url)
url = create(:artist_url, url: "https://www.pixiv.net/stacc/evazion")
assert_equal("http://www.pixiv.net/stacc/evazion/", url.normalized_url)
end
should "normalize pixiv fanbox account urls" do
url = FactoryBot.create(:artist_url, :url => "http://www.pixiv.net/fanbox/creator/3113804")
assert_equal("http://www.pixiv.net/fanbox/creator/3113804", url.url)
assert_equal("http://drw24olf.fanbox.cc/", url.normalized_url)
url = create(:artist_url, url: "https://www.pixiv.net/fanbox/creator/3113804")
assert_equal("http://www.pixiv.net/fanbox/creator/3113804/", url.normalized_url)
url = create(:artist_url, url: "https://omu001.fanbox.cc/posts/39714")
assert_equal("http://omu001.fanbox.cc/", url.normalized_url)
end
should "normalize pixiv.net/user/123 urls" do
url = create(:artist_url, url: "https://www.pixiv.net/en/users/123")
assert_equal("https://www.pixiv.net/en/users/123", url.url)
assert_equal("http://www.pixiv.net/member.php?id=123/", url.normalized_url)
end
should "normalize twitter urls" do
url = FactoryBot.create(:artist_url, :url => "https://twitter.com/aoimanabu/status/892370963630743552")
assert_equal("https://twitter.com/aoimanabu/status/892370963630743552", url.url)
url = create(:artist_url, url: "https://twitter.com/aoimanabu/status/892370963630743552")
assert_equal("http://twitter.com/aoimanabu/", url.normalized_url)
url = create(:artist_url, url: "https://twitter.com/BLAH")
assert_equal("http://twitter.com/BLAH/", url.normalized_url)
end
should "normalize https://twitter.com/intent/user?user_id=* urls" do
url = FactoryBot.create(:artist_url, :url => "https://twitter.com/intent/user?user_id=2784590030")
assert_equal("https://twitter.com/intent/user?user_id=2784590030", url.url)
url = create(:artist_url, url: "https://twitter.com/intent/user?user_id=2784590030")
assert_equal("http://twitter.com/intent/user?user_id=2784590030/", url.normalized_url)
end
should "normalize nijie urls" do
url = FactoryBot.create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png")
url = create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png")
assert_equal("http://nijie.info/members.php?id=236014/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://nijie.info/members.php?id=161703")
url = create(:artist_url, url: "https://nijie.info/members.php?id=161703")
assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703")
url = create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703")
assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url)
url = FactoryBot.create(:artist_url, url: "https://nijie.info/invalid.php")
url = create(:artist_url, url: "https://nijie.info/invalid.php")
assert_equal("http://nijie.info/invalid.php/", url.normalized_url)
end

View File

@@ -67,10 +67,6 @@ module Sources
should "get the artist name" do
assert_equal("Afrobull", @site.artist_name)
end
should "get the normalized url" do
assert_equal("https://www.hentai-foundry.com/user/Afrobull", @site.normalize_for_artist_finder)
end
end
context "A deleted picture" do