From cf8b8207e241749f35625232255ddd9a7356af06 Mon Sep 17 00:00:00 2001 From: evazion Date: Sat, 12 Mar 2022 19:54:47 -0600 Subject: [PATCH] artists: change how artist urls are normalized. Change how artist URLs are normalized in artist entries. Don't try to secretly convert image URLs to profile URLs in artist entries. For example, if someone puts a Pixiv image URL in an artist entry, don't secretly try to fetch the source and convert it into a profile URL in the `normalized_url` field. We did this because years ago, it was standard practice to put image URLs in artist entries. Pixiv image URLs used to contain the artist's username, so we used to put image URLs in artist entries for artist finding purposes. But Pixiv changed it so that image URLs no longer contained the username, so we dealt with it by adding a `normalized_url` column to artist_urls and secretly converting image URLs to profile URLs in this field. But this is no longer necessary because now we don't normally put image URLs in artist entries in the first place. Now the `profile_url` method in `Source::URL` is used to normalize URLs in artist entries. This lets us parse various profile URL formats and normalize them into a single canonical form. This also removes the `normalize_for_artist_finder` method from source strategies. Instead the `profile_url` method is used for artist finding purposes. So the profile URL returned by the source strategy needs to be the same as the URL in the artist entry in order for artist finding to work. --- app/logical/sources/strategies/base.rb | 12 +- app/logical/sources/strategies/nico_seiga.rb | 5 +- app/logical/sources/strategies/pixiv.rb | 8 +- app/logical/sources/strategies/twitter.rb | 4 - app/models/artist.rb | 3 +- app/models/artist_url.rb | 30 ++--- test/unit/artist_test.rb | 4 +- test/unit/artist_url_test.rb | 120 ++++++------------- test/unit/sources/hentai_foundry_test.rb | 4 - 9 files changed, 55 insertions(+), 135 deletions(-) diff --git a/app/logical/sources/strategies/base.rb b/app/logical/sources/strategies/base.rb index 96a159612..2fdda17d9 100644 --- a/app/logical/sources/strategies/base.rb +++ b/app/logical/sources/strategies/base.rb @@ -97,7 +97,8 @@ module Sources [artist_name, tag_name].compact.uniq end - # A link to the artist's profile page on the site. + # A link to the artist's profile page on the site. This will be used for + # artist finding purposes, so it needs to match the URL in the artist entry. def profile_url nil end @@ -137,12 +138,6 @@ module Sources end memoize :http_downloader - # The url to use for artist finding purposes. This will be stored in the - # artist entry. Normally this will be the profile url. - def normalize_for_artist_finder - profile_url.presence || url - end - # Given a post/image url, this is the normalized url that will be displayed in a post's page in its stead. # This function should never make any network call, even indirectly. Return nil to never normalize. def normalize_for_source @@ -150,7 +145,7 @@ module Sources end def artists - ArtistFinder.find_artists(normalize_for_artist_finder.to_s) + ArtistFinder.find_artists(profile_url.to_s) end # A new artist entry with suggested defaults for when the artist doesn't @@ -234,7 +229,6 @@ module Sources :image_urls => image_urls, :page_url => page_url, :canonical_url => canonical_url, - :normalized_for_artist_finder_url => normalize_for_artist_finder, :tags => tags, :normalized_tags => normalized_tags, :translated_tags => translated_tags, diff --git a/app/logical/sources/strategies/nico_seiga.rb b/app/logical/sources/strategies/nico_seiga.rb index 5eff5caa0..31c1e0f26 100644 --- a/app/logical/sources/strategies/nico_seiga.rb +++ b/app/logical/sources/strategies/nico_seiga.rb @@ -40,10 +40,7 @@ module Sources end def profile_url - user_id = api_client&.user_id - return if user_id.blank? # artists can be anonymous - - "https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" + "https://seiga.nicovideo.jp/user/illust/#{api_client.user_id}" if api_client.user_id.present? end def artist_name diff --git a/app/logical/sources/strategies/pixiv.rb b/app/logical/sources/strategies/pixiv.rb index 0cffcccb8..75101e087 100644 --- a/app/logical/sources/strategies/pixiv.rb +++ b/app/logical/sources/strategies/pixiv.rb @@ -62,12 +62,10 @@ module Sources end def profile_url - if parsed_url.profile_url.present? - parsed_url.profile_url - elsif api_illust[:userId].present? + if api_illust[:userId].present? "https://www.pixiv.net/users/#{api_illust[:userId]}" - else - nil + elsif parsed_url.profile_url.present? + parsed_url.profile_url end end diff --git a/app/logical/sources/strategies/twitter.rb b/app/logical/sources/strategies/twitter.rb index 374f860c7..1f0bf06b2 100644 --- a/app/logical/sources/strategies/twitter.rb +++ b/app/logical/sources/strategies/twitter.rb @@ -93,10 +93,6 @@ module Sources::Strategies api_response[:full_text].to_s end - def normalize_for_artist_finder - profile_url.try(:downcase).presence || url - end - def normalize_for_source if tag_name_from_url.present? && status_id.present? "https://twitter.com/#{tag_name_from_url}/status/#{status_id}" diff --git a/app/models/artist.rb b/app/models/artist.rb index d225d12d3..fd6658bc8 100644 --- a/app/models/artist.rb +++ b/app/models/artist.rb @@ -242,7 +242,8 @@ class Artist < ApplicationRecord elsif query.include?("*") where(id: ArtistURL.where_like(:url, query).select(:artist_id)) elsif query =~ %r{\Ahttps?://}i - ArtistFinder.find_artists(query) + url = Sources::Strategies.find(query).profile_url || query + ArtistFinder.find_artists(url) else where(id: ArtistURL.where_like(:url, "*#{query}*").select(:artist_id)) end diff --git a/app/models/artist_url.rb b/app/models/artist_url.rb index dbad3882c..445bab327 100644 --- a/app/models/artist_url.rb +++ b/app/models/artist_url.rb @@ -19,27 +19,16 @@ class ArtistURL < ApplicationRecord end def self.normalize_normalized_url(url) - if url.nil? - nil - else - url = url.sub(%r{^https://}, "http://") - url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2") - url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2") - url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com") - url = url.sub(%r{^http://pictures.hentai-foundry.com//}, "http://pictures.hentai-foundry.com/") + return nil if url.nil? - # the strategy won't always work for twitter because it looks for a status - url = url.downcase if url =~ %r{^https?://(?:mobile\.)?twitter\.com}i + url = Source::URL.parse(url)&.profile_url || url + url = url.sub(%r{^https://}, "http://") + url = url.sub(%r{^http://blog-imgs-\d+\.fc2}, "http://blog.fc2") + url = url.sub(%r{^http://blog-imgs-\d+-\w+\.fc2}, "http://blog.fc2") + url = url.sub(%r{^http://blog\d*\.fc2\.com/(?:\w/){,3}(\w+)}, "http://\\1.blog.fc2.com") - url = Sources::Strategies.find(url).normalize_for_artist_finder - - # XXX the Pixiv strategy should implement normalize_for_artist_finder and return the correct url directly. - url = url.sub(%r{\Ahttps?://www\.pixiv\.net/(?:en/)?users/(\d+)\z}i, 'https://www.pixiv.net/member.php?id=\1') - - url = url.gsub(%r{/+\Z}, "") - url = url.gsub(%r{^https://}, "http://") - url + "/" - end + url = url.gsub(%r{/+\Z}, "") + url + "/" end def self.search(params = {}) @@ -67,7 +56,8 @@ class ArtistURL < ApplicationRecord elsif url.include?("*") where_ilike(attr, url) else - where(attr => normalize_normalized_url(url)) + profile_url = Sources::Strategies.find(url).profile_url || url + where(attr => normalize_normalized_url(profile_url)) end end diff --git a/test/unit/artist_test.rb b/test/unit/artist_test.rb index 97dc2328d..07a3beb82 100644 --- a/test/unit/artist_test.rb +++ b/test/unit/artist_test.rb @@ -2,14 +2,14 @@ require 'test_helper' class ArtistTest < ActiveSupport::TestCase def assert_artist_found(expected_name, source_url) - artists = ArtistFinder.find_artists(source_url).to_a + artists = Artist.search(url_matches: source_url).to_a assert_equal(1, artists.size) assert_equal(expected_name, artists.first.name, "Testing URL: #{source_url}") end def assert_artist_not_found(source_url) - artists = ArtistFinder.find_artists(source_url).to_a + artists = Artist.search(url_matches: source_url).to_a assert_equal(0, artists.size, "Testing URL: #{source_url}") end diff --git a/test/unit/artist_url_test.rb b/test/unit/artist_url_test.rb index a9ccca9f9..938fb4657 100644 --- a/test/unit/artist_url_test.rb +++ b/test/unit/artist_url_test.rb @@ -52,149 +52,97 @@ class ArtistURLTest < ActiveSupport::TestCase end should "normalise https" do - url = FactoryBot.create(:artist_url, :url => "https://google.com") + url = create(:artist_url, url: "https://google.com") assert_equal("https://google.com", url.url) assert_equal("http://google.com/", url.normalized_url) end should "normalise domains to lowercase" do - url = FactoryBot.create(:artist_url, url: "https://ArtistName.example.com") + url = create(:artist_url, url: "https://ArtistName.example.com") assert_equal("http://artistname.example.com/", url.normalized_url) end - context "normalize twitter profile urls" do - setup do - @url = FactoryBot.create(:artist_url, :url => "https://twitter.com/BLAH") - end + should "normalize ArtStation urls" do + url = create(:artist_url, url: "https://www.artstation.com/koyorin") + assert_equal("http://www.artstation.com/koyorin/", url.normalized_url) - should "downcase the url" do - assert_equal("http://twitter.com/blah/", @url.normalized_url) - end - end - - context "artstation urls" do - setup do - @urls = [ - FactoryBot.create(:artist_url, url: "https://www.artstation.com/koyorin"), - FactoryBot.create(:artist_url, url: "https://koyorin.artstation.com"), - FactoryBot.create(:artist_url, url: "https://www.artstation.com/artwork/04XA4") - ] - end - - should "normalize" do - assert_equal("http://www.artstation.com/koyorin/", @urls[0].normalized_url) - assert_equal("http://www.artstation.com/koyorin/", @urls[1].normalized_url) - assert_equal("http://www.artstation.com/jeyrain/", @urls[2].normalized_url) - end - end - - context "deviantart urls" do - setup do - @urls = [ - FactoryBot.create(:artist_url, url: "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484"), - FactoryBot.create(:artist_url, url: "http://noizave.deviantart.com/art/test-post-please-ignore-685436408"), - FactoryBot.create(:artist_url, url: "https://www.deviantart.com/noizave") - ] - end - - should "normalize" do - assert_equal("http://www.deviantart.com/aeror404/", @urls[0].normalized_url) - assert_equal("http://www.deviantart.com/noizave/", @urls[1].normalized_url) - assert_equal("http://www.deviantart.com/noizave/", @urls[2].normalized_url) - end - end - - context "nicoseiga urls" do - setup do - @urls = [ - FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777"), - FactoryBot.create(:artist_url, url: "http://lohas.nicoseiga.jp/o/910aecf08e542285862954017f8a33a8c32a8aec/1433298801/4937663"), - FactoryBot.create(:artist_url, url: "http://seiga.nicovideo.jp/seiga/im4937663") - ] - end - - should "normalize" do - assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[0].normalized_url) - assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[1].normalized_url) - assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", @urls[2].normalized_url) - end + url = create(:artist_url, url: "https://koyorin.artstation.com"), + assert_equal("http://www.artstation.com/koyorin/", url.normalized_url) end should "normalize fc2 urls" do - url = FactoryBot.create(:artist_url, :url => "http://blog55.fc2.com/monet") + url = create(:artist_url, url: "http://blog55.fc2.com/monet") assert_equal("http://blog55.fc2.com/monet", url.url) assert_equal("http://monet.blog.fc2.com/", url.normalized_url) - url = FactoryBot.create(:artist_url, :url => "http://blog-imgs-55.fc2.com/monet") + url = create(:artist_url, url: "http://blog-imgs-55.fc2.com/monet") assert_equal("http://blog-imgs-55.fc2.com/monet", url.url) assert_equal("http://monet.blog.fc2.com/", url.normalized_url) end should "normalize deviant art artist urls" do - url = FactoryBot.create(:artist_url, :url => "https://www.deviantart.com/aeror404/art/Holiday-Elincia-424551484") - assert_equal("http://www.deviantart.com/aeror404/", url.normalized_url) + url = create(:artist_url, url: "https://noizave.deviantart.com") + assert_equal("http://www.deviantart.com/noizave/", url.normalized_url) end should "normalize nico seiga artist urls" do - url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/user/illust/7017777") + url = create(:artist_url, url: "http://seiga.nicovideo.jp/user/illust/7017777") assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url) - url = FactoryBot.create(:artist_url, :url => "http://seiga.nicovideo.jp/seiga/im4937663") - assert_equal("http://seiga.nicovideo.jp/user/illust/7017777/", url.normalized_url) + url = create(:artist_url, url: "http://seiga.nicovideo.jp/manga/list?user_id=23839737") + assert_equal("http://seiga.nicovideo.jp/user/illust/23839737/", url.normalized_url) + + url = create(:artist_url, url: "https://www.nicovideo.jp/user/20446930/mylist/28674289") + assert_equal("http://seiga.nicovideo.jp/user/illust/20446930/", url.normalized_url) end should "normalize hentai foundry artist urls" do - url = FactoryBot.create(:artist_url, :url => "http://pictures.hentai-foundry.com//a/AnimeFlux/219123.jpg") - assert_equal("http://www.hentai-foundry.com/user/AnimeFlux/", url.normalized_url) - end - - should "normalize pixiv urls" do - url = FactoryBot.create(:artist_url, :url => "https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png") - assert_equal("https://i.pximg.net/img-original/img/2010/11/30/08/39/58/14901720_p0.png", url.url) - assert_equal("http://www.pixiv.net/member.php?id=339253/", url.normalized_url) + url = create(:artist_url, url: "https://www.hentai-foundry.com/user/kajinman/profile") + assert_equal("http://www.hentai-foundry.com/user/kajinman/", url.normalized_url) end should "normalize pixiv stacc urls" do - url = FactoryBot.create(:artist_url, :url => "https://www.pixiv.net/stacc/evazion") - assert_equal("https://www.pixiv.net/stacc/evazion", url.url) + url = create(:artist_url, url: "https://www.pixiv.net/stacc/evazion") assert_equal("http://www.pixiv.net/stacc/evazion/", url.normalized_url) end should "normalize pixiv fanbox account urls" do - url = FactoryBot.create(:artist_url, :url => "http://www.pixiv.net/fanbox/creator/3113804") - assert_equal("http://www.pixiv.net/fanbox/creator/3113804", url.url) - assert_equal("http://drw24olf.fanbox.cc/", url.normalized_url) + url = create(:artist_url, url: "https://www.pixiv.net/fanbox/creator/3113804") + assert_equal("http://www.pixiv.net/fanbox/creator/3113804/", url.normalized_url) + + url = create(:artist_url, url: "https://omu001.fanbox.cc/posts/39714") + assert_equal("http://omu001.fanbox.cc/", url.normalized_url) end should "normalize pixiv.net/user/123 urls" do url = create(:artist_url, url: "https://www.pixiv.net/en/users/123") - assert_equal("https://www.pixiv.net/en/users/123", url.url) assert_equal("http://www.pixiv.net/member.php?id=123/", url.normalized_url) end should "normalize twitter urls" do - url = FactoryBot.create(:artist_url, :url => "https://twitter.com/aoimanabu/status/892370963630743552") - assert_equal("https://twitter.com/aoimanabu/status/892370963630743552", url.url) + url = create(:artist_url, url: "https://twitter.com/aoimanabu/status/892370963630743552") assert_equal("http://twitter.com/aoimanabu/", url.normalized_url) + + url = create(:artist_url, url: "https://twitter.com/BLAH") + assert_equal("http://twitter.com/BLAH/", url.normalized_url) end should "normalize https://twitter.com/intent/user?user_id=* urls" do - url = FactoryBot.create(:artist_url, :url => "https://twitter.com/intent/user?user_id=2784590030") - assert_equal("https://twitter.com/intent/user?user_id=2784590030", url.url) + url = create(:artist_url, url: "https://twitter.com/intent/user?user_id=2784590030") assert_equal("http://twitter.com/intent/user?user_id=2784590030/", url.normalized_url) end should "normalize nijie urls" do - url = FactoryBot.create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png") + url = create(:artist_url, url: "https://pic03.nijie.info/nijie_picture/236014_20170620101426_0.png") assert_equal("http://nijie.info/members.php?id=236014/", url.normalized_url) - url = FactoryBot.create(:artist_url, url: "https://nijie.info/members.php?id=161703") + url = create(:artist_url, url: "https://nijie.info/members.php?id=161703") assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url) - url = FactoryBot.create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703") + url = create(:artist_url, url: "https://www.nijie.info/members_illust.php?id=161703") assert_equal("http://nijie.info/members.php?id=161703/", url.normalized_url) - url = FactoryBot.create(:artist_url, url: "https://nijie.info/invalid.php") + url = create(:artist_url, url: "https://nijie.info/invalid.php") assert_equal("http://nijie.info/invalid.php/", url.normalized_url) end diff --git a/test/unit/sources/hentai_foundry_test.rb b/test/unit/sources/hentai_foundry_test.rb index 39b880d31..c381623bb 100644 --- a/test/unit/sources/hentai_foundry_test.rb +++ b/test/unit/sources/hentai_foundry_test.rb @@ -67,10 +67,6 @@ module Sources should "get the artist name" do assert_equal("Afrobull", @site.artist_name) end - - should "get the normalized url" do - assert_equal("https://www.hentai-foundry.com/user/Afrobull", @site.normalize_for_artist_finder) - end end context "A deleted picture" do