diff --git a/app/logical/artist_finder.rb b/app/logical/artist_finder.rb
index 7e2a88da7..902158f27 100644
--- a/app/logical/artist_finder.rb
+++ b/app/logical/artist_finder.rb
@@ -142,7 +142,8 @@ module ArtistFinder
 
   SITE_BLACKLIST_REGEXP = Regexp.union(SITE_BLACKLIST.map do |domain|
     domain = Regexp.escape(domain) if domain.is_a?(String)
-    %r{\Ahttps?://(?:[a-zA-Z0-9_-]+\.)*#{domain}/\z}i
+    # Anchor both ends: find_artists must stop only when the trimmed URL is exactly a blacklisted entry, not merely prefixed by one.
+    %r{\A(?:[a-zA-Z0-9_-]+\.)*#{domain}\z}i
   end)
 
   # Find the artist for a given artist profile URL. May return multiple Artists
@@ -155,17 +156,22 @@ module ArtistFinder
   # @return [Array] the list of matching artists
   def find_artists(url)
     url = ArtistURL.normalize_normalized_url(url)
-    artists = []
-
-    while artists.empty? && url.size > 10
-      u = url.sub(%r{/+$}, "") + "/"
-      u = u.to_escaped_for_sql_like.gsub(/\*/, '%') + '%'
-      artists += Artist.joins(:urls).where(["artists.is_deleted = FALSE AND artist_urls.normalized_url LIKE ? ESCAPE E'\\\\'", u]).limit(10).order("artists.name").all
-      url = File.dirname(url) + "/"
+
+    # First try an exact match
+    artists = Artist.active.joins(:urls).where(urls: { url: url })
+
+    # If that fails, try removing the rightmost path component until we find an artist URL that matches the current URL.
+    url = url.downcase.gsub(%r{\Ahttps?://|/\z}, "") # "https://example.com/A/B/C/" => "example.com/a/b/c"
+    while artists.empty? && url != "."
+      u = url.gsub("*", '\*') + "/*"
+      artists += Artist.active.joins(:urls).where_like("regexp_replace(lower(artist_urls.url), '^https?://|/$', '', 'g') || '/'", u).limit(10)
+
+      # File.dirname("example.com/a/b/c") => "example.com/a/b"; File.dirname("example.com") => "."
+      url = File.dirname(url)
 
       break if url =~ SITE_BLACKLIST_REGEXP
     end
 
-    Artist.where(id: artists.uniq(&:name).take(20))
+    Artist.where(id: artists.uniq.take(20))
   end
 end
diff --git a/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb b/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb
new file mode 100644
index 000000000..e383c40de
--- /dev/null
+++ b/db/migrate/20220318082614_add_lower_url_index_to_artist_urls.rb
@@ -0,0 +1,11 @@
+class AddLowerURLIndexToArtistURLs < ActiveRecord::Migration[7.0]
+  disable_ddl_transaction!
+
+  def change
+    # This index is used by the ArtistFinder.
+    #
+    # regexp_replace(lower('https://www.twitter.com/DanbooruBot'), '^https?://|/$', '', 'g') || '/'
+    # => 'www.twitter.com/danboorubot/'
+    add_index :artist_urls, "(regexp_replace(lower(artist_urls.url), '^https?://|/$', '', 'g') || '/') text_pattern_ops", name: :index_artist_urls_on_regexp_replace_lower_url, algorithm: :concurrently
+  end
+end
diff --git a/db/structure.sql b/db/structure.sql
index f238d2568..30de5fb6b 100644
--- a/db/structure.sql
+++ b/db/structure.sql
@@ -3139,6 +3139,13 @@ CREATE INDEX index_artist_urls_on_normalized_url_pattern ON public.artist_urls U
 CREATE INDEX index_artist_urls_on_normalized_url_trgm ON public.artist_urls USING gin (normalized_url public.gin_trgm_ops);
 
 
+--
+-- Name: index_artist_urls_on_regexp_replace_lower_url; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX index_artist_urls_on_regexp_replace_lower_url ON public.artist_urls USING btree (((regexp_replace(lower(url), '^https?://|/$'::text, ''::text, 'g'::text) || '/'::text)) text_pattern_ops);
+
+
 --
 -- Name: index_artist_urls_on_url_trgm; Type: INDEX; Schema: public; Owner: -
 --
@@ -5779,6 +5786,7 @@ INSERT INTO "schema_migrations" (version) VALUES
 ('20220207195123'),
 ('20220210171310'),
 ('20220210200157'),
-('20220211075129');
+('20220211075129'),
+('20220318082614');
 
 