From fc5db679e47ff433128b4142f5db9074d682eb5e Mon Sep 17 00:00:00 2001 From: evazion Date: Sun, 10 Jan 2021 03:35:12 -0600 Subject: [PATCH] autocomplete: optimize searching by artist/wiki page other names. Optimize searches for non-English phrases in autocomplete. These searches were pretty slow, and could sometimes cause sitewide lag spikes when users typed long strings of non-English text into the search box and caused an unintentional DoS. The trick is to use an `array_to_tsvector(other_names) USING gin` index on other_names. This supports fast string prefix matching against all elements of the array. The downside is that it doesn't allow infix or suffix matches, so we can't support wildcards in general. Wildcards didn't quite work anyway, since artist and wiki other names can contain literal '*' characters. --- app/logical/autocomplete_service.rb | 20 ++++++++----------- app/logical/concerns/searchable.rb | 5 +++++ ...svector_index_on_wiki_pages_and_artists.rb | 6 ++++++ db/structure.sql | 17 +++++++++++++++- 4 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 db/migrate/20210110090656_add_array_to_tsvector_index_on_wiki_pages_and_artists.rb diff --git a/app/logical/autocomplete_service.rb b/app/logical/autocomplete_service.rb index b870755b5..fbc91d729 100644 --- a/app/logical/autocomplete_service.rb +++ b/app/logical/autocomplete_service.rb @@ -65,7 +65,9 @@ class AutocompleteService end def autocomplete_tag(string) - if string.starts_with?("/") + if !string.ascii_only? + results = tag_other_name_matches(string) + elsif string.starts_with?("/") string = string + "*" unless string.include?("*") results = tag_matches(string) @@ -77,11 +79,8 @@ class AutocompleteService results = results.uniq { |r| r[:value] }.take(limit) elsif string.include?("*") results = tag_matches(string) - results = tag_other_name_matches(string) if results.blank? 
else - string += "*" - results = tag_matches(string) - results = tag_other_name_matches(string) if results.blank? + results = tag_matches(string + "*") results = tag_autocorrect_matches(string) if results.blank? end @@ -89,7 +88,7 @@ class AutocompleteService end def tag_matches(string) - return [] if string =~ /[^[:ascii:]]/ + return [] unless string.ascii_only? name_matches = Tag.nonempty.name_matches(string).order(post_count: :desc).limit(limit) alias_matches = Tag.nonempty.alias_matches(string).order(post_count: :desc).limit(limit) @@ -112,7 +111,6 @@ class AutocompleteService end def tag_autocorrect_matches(string) - string = string.delete("*") tags = Tag.nonempty.autocorrect_matches(string).limit(limit) tags.map do |tag| @@ -121,16 +119,14 @@ class AutocompleteService end def tag_other_name_matches(string) - return [] unless string =~ /[^[:ascii:]]/ - - artists = Artist.undeleted.any_other_name_like(string) - wikis = WikiPage.undeleted.other_names_match(string) + artists = Artist.undeleted.where_any_in_array_starts_with(:other_names, string) + wikis = WikiPage.undeleted.where_any_in_array_starts_with(:other_names, string) tags = Tag.where(name: wikis.select(:title)).or(Tag.where(name: artists.select(:name))) tags = tags.nonempty.order(post_count: :desc).limit(limit).includes(:wiki_page, :artist) tags.map do |tag| other_names = tag.artist&.other_names.to_a + tag.wiki_page&.other_names.to_a - antecedent = other_names.find { |other_name| other_name.ilike?(string) } + antecedent = other_names.find { |other_name| other_name.ilike?(string + "*") } { type: "tag-other-name", label: tag.pretty_name, value: tag.name, category: tag.category, post_count: tag.post_count, antecedent: antecedent } end end diff --git a/app/logical/concerns/searchable.rb b/app/logical/concerns/searchable.rb index ebc8548ed..bc519dc11 100644 --- a/app/logical/concerns/searchable.rb +++ b/app/logical/concerns/searchable.rb @@ -97,6 +97,11 @@ module Searchable where("? 
~<< ANY(#{qualified_column_for(attr)})", "(?#{flags})#{regex}") end + # The column should have an `array_to_tsvector(column) using gin` index for best performance. + def where_any_in_array_starts_with(attr, value) + where("array_to_tsvector(#{qualified_column_for(attr)}) @@ ?", value.to_escaped_for_tsquery + ":*") + end + def where_text_includes_lower(attr, values) where("lower(#{qualified_column_for(attr)}) IN (?)", values.map(&:downcase)) end diff --git a/db/migrate/20210110090656_add_array_to_tsvector_index_on_wiki_pages_and_artists.rb b/db/migrate/20210110090656_add_array_to_tsvector_index_on_wiki_pages_and_artists.rb new file mode 100644 index 000000000..3f951be47 --- /dev/null +++ b/db/migrate/20210110090656_add_array_to_tsvector_index_on_wiki_pages_and_artists.rb @@ -0,0 +1,6 @@ +class AddArrayToTsvectorIndexOnWikiPagesAndArtists < ActiveRecord::Migration[6.1] + def change + add_index :wiki_pages, "array_to_tsvector(other_names)", using: :gin + add_index :artists, "array_to_tsvector(other_names)", using: :gin + end +end diff --git a/db/structure.sql b/db/structure.sql index e6f0daaa5..04a414cc9 100644 --- a/db/structure.sql +++ b/db/structure.sql @@ -4898,6 +4898,13 @@ CREATE INDEX index_artist_versions_on_updater_id ON public.artist_versions USING CREATE INDEX index_artist_versions_on_updater_ip_addr ON public.artist_versions USING btree (updater_ip_addr); +-- +-- Name: index_artists_on_array_to_tsvector_other_names; Type: INDEX; Schema: public; Owner: - +-- + +CREATE INDEX index_artists_on_array_to_tsvector_other_names ON public.artists USING gin (array_to_tsvector(other_names)); + + -- -- Name: index_artists_on_group_name; Type: INDEX; Schema: public; Owner: - -- @@ -7535,6 +7542,13 @@ CREATE INDEX index_wiki_page_versions_on_updater_ip_addr ON public.wiki_page_ver CREATE INDEX index_wiki_page_versions_on_wiki_page_id ON public.wiki_page_versions USING btree (wiki_page_id); +-- +-- Name: index_wiki_pages_on_array_to_tsvector_other_names; Type: INDEX; 
Schema: public; Owner: - +-- + +CREATE INDEX index_wiki_pages_on_array_to_tsvector_other_names ON public.wiki_pages USING gin (array_to_tsvector(other_names)); + + -- -- Name: index_wiki_pages_on_body_index_index; Type: INDEX; Schema: public; Owner: - -- @@ -7870,6 +7884,7 @@ INSERT INTO "schema_migrations" (version) VALUES ('20210108030722'), ('20210108030723'), ('20210108030724'), -('20210110015410'); +('20210110015410'), +('20210110090656');