From f6abf39ebc241b782c383631c2f93711a9f96664 Mon Sep 17 00:00:00 2001
From: evazion <noizave@gmail.com>
Date: Mon, 11 Oct 2021 23:39:03 -0500
Subject: [PATCH] search: try to optimize slow searches.

Try to optimize certain types of common slow searches:

* Searches for mutually-exclusive tags (e.g. `1girl multiple_girls`,
  `touhou solo -1girl -1boy`)
* Relatively large tags that are heavily skewed towards old posts
  (e.g. lucky_star, haruhi_suzumiya_no_yuuutsu, inazuma_eleven_(series),
  imageboard_desourced).
* Mid-sized tags in the <30k post range that Postgres thinks are
  big enough for a post id index scan, but a tag index scan is faster.

The general pattern is Postgres not using the tag index because it
thinks scanning down the post id index would be faster, but it's
actually much slower because it degrades to a full table scan. This
usually happens when Postgres thinks a tag is larger or more common than
it really is. Here we try to force Postgres into using the tag index
when we know the search is small.

One case that is still slow is `2girls -multiple_girls`. This returns no
results, but we can't know that without searching all of `2girls`. The
general case is searching for `A -B` where A is a subset of B and A and B
are both large tags.

Hopefully fixes #581, #654, #743, #1020, #1039, #1421, #2207, #4070,
 #4337, #4896, and various other issues raised over the years regarding
slow searches.
---
 app/logical/post_query_builder.rb     | 52 ++++++++++++++++++++++++++-
 app/logical/post_sets/post.rb         | 16 ++-------
 app/logical/related_tag_calculator.rb |  2 +-
 config/danbooru_default_config.rb     |  7 ++++
 4 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/app/logical/post_query_builder.rb b/app/logical/post_query_builder.rb
index 72ffdf18e..71bfbcecf 100644
--- a/app/logical/post_query_builder.rb
+++ b/app/logical/post_query_builder.rb
@@ -514,6 +514,52 @@ class PostQueryBuilder
     relation
   end
 
+  def paginated_posts(page, small_search_threshold: Danbooru.config.small_search_threshold, **options)
+    posts = build.paginate(page, **options)
+    posts = optimize_search(posts, 30_000)
+    posts.load
+  end
+
+  # XXX This is an ugly hack to try to deal with slow searches. By default,
+  # Postgres wants to do an index scan down the post id index for large
+  # order:id searches, and a bitmap scan on the tag index for small searches.
+  # The problem is that Postgres can't always tell whether a search is large or
+  # small. For large mutually-exclusive tags like 1girl + multiple_girls,
+  # Postgres assumes the search is large when actually it's small. For small
+  # tags, Postgres sometimes assumes tags in the 10k-50k range are large enough
+  # for a post id index scan, when in reality a tag index bitmap scan would be
+  # better.
+  def optimize_search(relation, small_search_threshold)
+    return relation unless small_search_threshold.present?
+    return relation unless relation.order_values == ["posts.id DESC"]
+
+    if post_count.nil?
+      # If post_count is nil, then the search took too long to count and we don't
+      # know whether it's large or small. First we try it normally assuming it's
+      # large, then if that times out we try again assuming it's small.
+      posts = Post.with_timeout(1000) { relation.load }
+      posts = small_search(relation) if posts.nil?
+    elsif post_count <= small_search_threshold
+      # Otherwise if we know the search is small, then treat it as a small search.
+      posts = small_search(relation)
+    else
+      # Otherwise if we know it's large, treat it normally
+      posts = relation
+    end
+
+    posts
+  end
+
+  # Perform a search, forcing Postgres to do a bitmap scan on the tags index.
+  # https://www.postgresql.org/docs/current/runtime-config-query.html
+  def small_search(relation)
+    Post.transaction do
+      Post.connection.execute("SET LOCAL enable_seqscan = off")
+      Post.connection.execute("SET LOCAL enable_indexscan = off")
+      relation.load
+    end
+  end
+
   def search_order(relation, order)
     case order.to_s.downcase
     when "id", "id_asc"
@@ -891,6 +937,10 @@ class PostQueryBuilder
   end
 
   concerning :CountMethods do
+    def post_count
+      fast_count
+    end
+
     # Return an estimate of the number of posts returned by the search.  By
     # default, we try to use an estimated or cached count before doing an exact
     # count.
@@ -1093,5 +1143,5 @@ class PostQueryBuilder
     end
   end
 
-  memoize :split_query
+  memoize :split_query, :post_count
 end
diff --git a/app/logical/post_sets/post.rb b/app/logical/post_sets/post.rb
index cca5a44fe..7e16df17b 100644
--- a/app/logical/post_sets/post.rb
+++ b/app/logical/post_sets/post.rb
@@ -7,7 +7,8 @@ module PostSets
     MAX_PER_PAGE = 200
     MAX_SIDEBAR_TAGS = 25
 
-    attr_reader :page, :random, :post_count, :format, :tag_string, :query, :normalized_query
+    attr_reader :page, :random, :format, :tag_string, :query, :normalized_query
+    delegate :post_count, to: :normalized_query
 
     def initialize(tags, page = 1, per_page = nil, user: CurrentUser.user, random: false, format: "html")
       @query = PostQueryBuilder.new(tags, user, tag_limit: user.tag_query_limit, safe_mode: CurrentUser.safe_mode?, hide_deleted_posts: user.hide_deleted_posts?)
@@ -97,27 +98,16 @@ module PostSets
       random || query.find_metatag(:order) == "random"
     end
 
-    def get_post_count
-      if %w[json atom xml].include?(format.downcase)
-        # no need to get counts for formats that don't use a paginator
-        nil
-      else
-        normalized_query.fast_count
-      end
-    end
-
     def get_random_posts
       ::Post.user_tag_match(tag_string).random(per_page)
     end
 
     def posts
       @posts ||= begin
-        @post_count = get_post_count
-
         if is_random?
           get_random_posts.paginate(page, search_count: false, limit: per_page, max_limit: max_per_page).load
         else
-          normalized_query.build.paginate(page, count: post_count, search_count: !post_count.nil?, limit: per_page, max_limit: max_per_page).load
+          normalized_query.paginated_posts(page, count: post_count, search_count: !post_count.nil?, limit: per_page, max_limit: max_per_page).load
         end
       end
     end
diff --git a/app/logical/related_tag_calculator.rb b/app/logical/related_tag_calculator.rb
index cd9e73f6f..ad5a9d0fd 100644
--- a/app/logical/related_tag_calculator.rb
+++ b/app/logical/related_tag_calculator.rb
@@ -23,7 +23,7 @@ module RelatedTagCalculator
   # @param category [Integer] an optional tag category, to restrict the tags to a given category.
   # @return [Array<Tag>] the set of similar tags, ordered by most similar
   def self.similar_tags_for_search(post_query, search_sample_size: 1000, tag_sample_size: 250, category: nil)
-    search_count = post_query.fast_count
+    search_count = post_query.post_count
     return [] if search_count.nil?
 
     search_sample_size = [search_count, search_sample_size].min
diff --git a/config/danbooru_default_config.rb b/config/danbooru_default_config.rb
index 971cdcec8..cbbccebb0 100644
--- a/config/danbooru_default_config.rb
+++ b/config/danbooru_default_config.rb
@@ -318,6 +318,13 @@ module Danbooru
       []
     end
 
+    # Tag searches with less than this many results will be considered "small
+    # searches" and optimized specially. This is unnecessary unless you have a
+    # Danbooru-sized database.
+    def small_search_threshold
+      nil
+    end
+
     # Your Pixiv PHPSESSID cookie. Get this by logging in to Pixiv and using
     # the devtools to find the PHPSESSID cookie. This is need for Pixiv upload
     # support.