From c64df46de4fc59590d17ff8d6af91d8640a5f8a4 Mon Sep 17 00:00:00 2001
From: evazion
Date: Fri, 18 Mar 2022 03:48:38 -0500
Subject: [PATCH] artists: make artist finder use `url` instead of `normalized_url`.

Make the artist finder search for artists using the `url` field instead of the `normalized_url` field. This lets us get rid of `normalized_url` in the future.

As described in 10dac3ee5, artist URLs have both a `url` column and a `normalized_url` column. The `normalized_url` column was the one used for artist finding. The `url` was secretly normalized behind the scenes so that artist finding would work no matter how the URL was written in the artist entry. This is no longer necessary now that URLs are directly normalized in artist entries.

This fixes various cases where artist finding didn't work for non-obvious reasons, usually because the URL wasn't written in the right format so it wasn't properly normalized behind the scenes.

This also makes it so that artist finding is case-insensitive, which fixes #4821. Hopefully no sites are perverse enough to allow two different usernames that differ only in case.

Users running their own Danbooru instance may have to fix the URLs in their artist entries for artist finding to work again.
There are a few fix scripts to help with this: * script/fixes/104_normalize_weibo_artist_urls.rb * script/fixes/105_normalize_pixiv_artist_urls.rb * script/fixes/106_normalize_artist_urls.rb --- app/logical/artist_finder.rb | 21 ++++++++++++------- ...2614_add_lower_url_index_to_artist_urls.rb | 11 ++++++++++ db/structure.sql | 10 ++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb diff --git a/app/logical/artist_finder.rb b/app/logical/artist_finder.rb index 7e2a88da7..902158f27 100644 --- a/app/logical/artist_finder.rb +++ b/app/logical/artist_finder.rb @@ -142,7 +142,7 @@ module ArtistFinder SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain| domain = Regexp.escape(domain) if domain.is_a?(String) - %r{\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z}i + %r{\A(?:[a-zA-Z0-9_-]+\.)*#{domain}}i end) # Find the artist for a given artist profile URL. May return multiple Artists @@ -155,17 +155,22 @@ module ArtistFinder # @return [Array] the list of matching artists def find_artists(url) url = ArtistURL.normalize_normalized_url(url) - artists = [] - while artists.empty? && url.size > 10 - u = url.sub(%r{/+$}, "") + "/" - u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%' - artists += Artist.joins(:urls).where(["artists.is_deleted = FALSE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all - url = File.dirname(url) + "/" + # First try an exact match + artists = Artist.active.joins(:urls).where(urls: { url: url }) + + # If that fails, try removing the rightmost path component until we find an artist URL that matches the current URL. + url = url.downcase.gsub(%r{\Ahttps?://|/\z}, "") # "https://example.com/A/B/C/" => "example.com/a/b/c" + while artists.empty? && url != "." 
+ u = url.gsub("*", '\*') + "/*" + artists += Artist.active.joins(:urls).where_like("regexp_replace(lower(artist_urls.url), '^https?://|/$', '', 'g') || '/'", u).limit(10) + + # File.dirname("example.com/a/b/c") => "example.com/a/b"; File.dirname("example.com") => "." + url = File.dirname(url) break if url =~ SITE_BLACKLIST_REGEXP end - Artist.where(id: artists.uniq(&:name).take(20)) + Artist.where(id: artists.uniq.take(20)) end end diff --git a/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb b/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb new file mode 100644 index 000000000..e383c40de --- /dev/null +++ b/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb @@ -0,0 +1,11 @@ +class AddLowerURLIndexToArtistURLs < ActiveRecord::Migration[7.0] + disable_ddl_transaction! + + def change + # This index is used by the ArtistFinder. + # + # regexp_replace(lower('https://www.twitter.com/DanbooruBot'), '^https?://|/$', '', 'g') || '/' + # => 'www.twitter.com/danboorubot/' + add_index :artist_urls, "(regexp_replace(lower(artist_urls.url), '^https?://|/$', '', 'g') || '/') text_pattern_ops", name: :index_artist_urls_on_regexp_replace_lower_url, algorithm: :concurrently + end +end diff --git a/db/structure.sql b/db/structure.sql index f238d2568..30de5fb6b 100644 --- a/db/structure.sql +++ b/db/structure.sql @@ -3139,6 +3139,13 @@ CREATE INDEX index_artist_urls_on_normalized_url_pattern ON public.artist_urls U CREATE INDEX index_artist_urls_on_normalized_url_trgm ON public.artist_urls USING gin (normalized_url public.gin_trgm_ops); +-- +-- Name: index_artist_urls_on_regexp_replace_lower_url; Type: INDEX; Schema: public; Owner: - +-- + +CREATE INDEX index_artist_urls_on_regexp_replace_lower_url ON public.artist_urls USING btree (((regexp_replace(lower(url), '^https?://|/$'::text, ''::text, 'g'::text) || '/'::text)) text_pattern_ops); + + -- -- Name: index_artist_urls_on_url_trgm; Type: INDEX; Schema: public; Owner: - -- @@ -5779,6 
+5786,7 @@ INSERT INTO "schema_migrations" (version) VALUES ('20220207195123'), ('20220210171310'), ('20220210200157'), -('20220211075129'); +('20220211075129'), +('20220318082614');