Add a new tag search parser that supports full boolean expressions, including `and`, `or`, and `not` operators and parenthesized subexpressions.

This is only the parser itself, not the code for converting the search into SQL. The new parser isn't used yet for actual searches. Searches still use the old parser.

Some example syntax:

* `1girl 1boy`
* `1girl and 1boy` (same as `1girl 1boy`)
* `1girl or 1boy`
* `~1girl ~1boy` (same as `1girl or 1boy`)
* `1girl and ((blonde_hair blue_eyes) or (red_hair green_eyes))`
* `1girl ~(blonde_hair blue_eyes) ~(red_hair green_eyes)` (same as above)
* `1girl -(blonde_hair blue_eyes)`
* `*_hair *_eyes`
* `*_hair or *_eyes`
* `user:evazion or fav:evazion`
* `~user:evazion ~fav:evazion`

Rules:

AND is implicit between terms, but may be written explicitly:

* `a b c` is `a and b and c`

AND has higher precedence (binds tighter) than OR:

* `a or b and c or d` is `a or (b and c) or d`
* `a or b c or d e` is `a or (b and c) or (d and e)`

All `~` operators in the same subexpression are combined into a single OR:

* `a b ~c ~d` is `a b (c or d)`
* `~a ~b and ~c ~d` is `(a or b) (c or d)`
* `(~a ~b) (~c ~d)` is `(a or b) (c or d)`

A single `~` operator in a subexpression by itself is ignored:

* `a ~b` is `a b`
* `~a and ~b` is `a and b`, which is `a b`
* `(~a) ~b` is `a ~b`, which is `a b`

The parser is written as a backtracking recursive descent parser built on top of StringScanner and a handful of parser combinators. The parser generates an AST, which is then simplified using Boolean algebra to remove redundant nodes and to convert the expression to conjunctive normal form (that is, a product of sums, or an AND of ORs).
270 lines
6.6 KiB
Ruby
270 lines
6.6 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "strscan"
|
|
|
|
# A PostQuery::Parser parses a search string into a PostQuery::AST.
|
|
#
|
|
# @example
|
|
#
|
|
# ast = PostQuery.new("1girl or 1boy").parse
|
|
#
|
|
# Grammar:
|
|
#
|
|
# root = or_clause [root]
|
|
# or_clause = and_clause "or" or_clause
|
|
# | and_clause
|
|
# and_clause = factor_list "and" and_clause
|
|
# | factor_list
|
|
# factor_list = factor [factor_list]
|
|
# factor = "-" expr
|
|
# | "~" expr
|
|
# | expr
|
|
# expr = "(" or_clause ")" | term
|
|
# term = metatag | tag | wildcard
|
|
# metatag = metatag_name ":" quoted_string
|
|
# metatag_name = "user" | "fav" | "pool" | "order" | ...
|
|
# quoted_string = '"' /([^"\\]|\\")*/ '"' | /[^ ]+/
|
|
# tag = /[^ *]+/
|
|
# wildcard = /[^ ]+/
|
|
#
|
|
# Ref:
|
|
#
|
|
# * https://hmac.dev/posts/2019-05-19-ruby-parser-combinators.html
|
|
|
|
class PostQuery
|
|
class Parser
|
|
# Raised internally whenever the input can't be parsed; `parse` rescues it.
class Error < StandardError
end

# Matches a metatag name immediately followed by ":", ignoring case.
# The accepted names come from PostQueryBuilder::METATAGS (defined elsewhere).
METATAG_NAME_REGEX = /(#{PostQueryBuilder::METATAGS.join("|")}):/i

# The search string being parsed.
attr_reader :input

# Internal state: the StringScanner over `input`, and the count of "(" that
# have been consumed but not yet closed.
private attr_reader :scanner, :unclosed_parens
|
|
|
|
# Set up a parser for one search string.
#
# @param input [String] The search string to parse. It is converted with `to_s`.
def initialize(input)
  # Number of "(" consumed so far that haven't been matched by a ")" yet.
  @unclosed_parens = 0

  # Work on a frozen private copy so the caller's string is left untouched.
  @input = input.to_s.clone.freeze
  @scanner = StringScanner.new(@input)
end
|
|
|
|
# Convenience entry point: build a parser for `string` and run it.
#
# @param string [String] The search string to parse.
# @return [PostQuery::AST] The AST of the parsed search.
def self.parse(string)
  parser = new(string)
  parser.parse
end
|
|
|
|
concerning :ParserMethods do
|
|
# Non-raising variant of `parse!`: a search that fails to parse is turned
# into a `:none` node (a search that matches nothing) instead of an exception.
#
# @return [PostQuery::AST] The AST of the parsed search.
def parse
  parse!
rescue Error
  # Only parser errors are swallowed; any other exception still propagates.
  node(:none)
end
|
|
|
|
# Parse the search and return the AST, or raise an error if the parse failed.
#
# @return [PostQuery::AST] The AST of the parsed search.
# @raise [Error] If part of the input could not be parsed, or a "(" was left unclosed.
def parse!
  ast = root

  # `root` stops at the first thing it can't parse, so anything still left in
  # the scanner is input we failed to consume: we expected EOS and didn't get it.
  raise Error, "Expected end of input (rest: '#{@scanner.rest}')" unless @scanner.eos?
  raise Error, "Unclosed parentheses (#{@unclosed_parens})" unless @unclosed_parens.zero?

  ast
end
|
|
|
|
private
|
|
|
|
# root = or_clause [root]
#
# Parses as many top-level OR clauses as possible. No clauses at all means
# "match everything"; several clauses are ANDed together.
def root
  clauses = zero_or_more { or_clause }
  space # consume trailing spaces

  case clauses.size
  when 0 then node(:all)
  when 1 then clauses.first
  else node(:and, *clauses)
  end
end
|
|
|
|
# or_clause = and_clause "or" or_clause | and_clause
#
# Right-recursive: `a or b or c` becomes or(a, or(b, c)).
def or_clause
  left = and_clause

  space
  # The keyword is case-insensitive and must be followed by at least one space.
  return left unless accept(/or +/i)

  right = or_clause
  node(:or, left, right)
end
|
|
|
|
# and_clause = factor_list "and" and_clause | factor_list
#
# Right-recursive: `a and b and c` becomes and(a, and(b, c)).
def and_clause
  left = factor_list

  space
  # The keyword is case-insensitive and must be followed by at least one space.
  return left unless accept(/and +/i)

  right = and_clause
  node(:and, left, right)
end
|
|
|
|
# factor_list = factor [factor_list]
#
# One or more adjacent factors, implicitly ANDed together. The result is
# always wrapped in an :and node, even when there is only one factor.
def factor_list
  factors = one_or_more { factor }
  node(:and, *factors)
end
|
|
|
|
# factor = "-" expr | "~" expr | expr
#
# A leading "-" wraps the expression in a :not node and a leading "~" wraps it
# in an :opt node; otherwise the expression is returned as-is.
def factor
  space

  wrapper =
    if accept("-") then :not
    elsif accept("~") then :opt
    end

  wrapper ? node(wrapper, expr) : expr
end
|
|
|
|
# expr = "(" or_clause ")" | term
#
# While a parenthesized group is being parsed, @unclosed_parens is one higher,
# which lets `string` know how many trailing ")" belong to enclosing groups.
def expr
  space
  return term unless accept("(")

  @unclosed_parens += 1
  inner = or_clause
  expect(")")
  @unclosed_parens -= 1

  inner
end
|
|
|
|
# term = metatag | wildcard | tag
#
# The alternatives are tried in this order. `metatag` and `wildcard` return
# nil when they don't match, so the next alternative gets a chance.
def term
  metatag || wildcard || tag
end
|
|
|
|
# metatag = metatag_name ":" quoted_string
# metatag_name = "user" | "fav" | "pool" | "order" | ...
#
# Returns nil, consuming nothing, if the input doesn't start with a metatag
# name. The name is downcased; the value is kept as written.
def metatag
  return unless accept(METATAG_NAME_REGEX)

  # The match includes the ":" separator; strip it to get the bare name.
  name = @scanner.matched.delete_suffix(":")
  value = quoted_string

  node(:metatag, name.downcase, value)
end
|
|
|
|
# quoted_string = '"' /([^"\\]|\\")*/ '"' | /[^ ]+/
#
# Parses a metatag value. A value starting with a double quote runs up to the
# closing quote and may contain backslash-escaped quotes (\"), which are
# unescaped in the result. Any other value is a run of nonspace characters.
def quoted_string
  return string(/[^ ]+/) unless accept('"')

  raw = accept(/([^"\\]|\\")*/)
  expect('"')

  raw.gsub(/\\"/, '"') # handle backslash escaped quotes
end
|
|
|
|
# A wildcard is a run of nonspace characters that contains at least one '*'
# and whose first character is not a space, ')', '~', or '-'.
#
# Returns nil, consuming nothing, if the input doesn't start with a wildcard.
#
# @return [PostQuery::AST, nil] A :wildcard node holding the downcased pattern, or nil.
def wildcard
  t = accept(/(?=[^ ]*\*)[^ \)~-][^ ]*/)
  return if t.nil?

  # The pattern above also swallows trailing right parens. Inside a
  # parenthesized group those close the group (`(foo*)`), so give back one per
  # currently open group, the same way `string` does for plain tags.
  @unclosed_parens.times do
    break unless t.end_with?(")")

    t.chop!
    @scanner.pos -= 1
  end

  space
  node(:wildcard, t.downcase)
end
|
|
|
|
# A tag is a run of nonspace characters whose first character is not a space,
# ')', '~', or '-'. The bare words "and" and "or" (in any case) are operators,
# not tags, and are rejected.
#
# @return [PostQuery::AST] A :tag node holding the downcased tag name.
# @raise [Error] If no tag could be read, or the word is "and" / "or".
def tag
  name = string(/[^ \)~-][^ ]*/).downcase
  raise Error if %w[and or].include?(name)

  space
  node(:tag, name)
end
|
|
|
|
# Match `pattern` and return the matched text, minus any trailing right parens
# that close a currently open group.
#
# @param pattern [Regexp, String] The pattern to match.
# @return [String] The matched string.
# @raise [Error] If the pattern doesn't match at the current position.
def string(pattern)
  str = expect(pattern)

  # XXX: Now put back any trailing right parens we mistakenly consumed.
  # At most one paren is returned per currently open "(".
  @unclosed_parens.times do
    break unless str.end_with?(")")

    str.chop!
    @scanner.pos -= 1
  end

  str
end
|
|
|
|
# Skip over any run of spaces at the current position. The pattern also
# matches zero spaces, so this never fails.
#
# @return [String] The spaces that were skipped (possibly empty).
def space
  expect(/ */)
end
|
|
end
|
|
|
|
concerning :HelperMethods do
|
|
private
|
|
|
|
# Attempt to consume `pattern` at the current scanner position.
#
# On a match the scanner advances past it; on a mismatch the scanner is left
# where it was.
#
# @param pattern [Regexp, String] The pattern to match.
# @return [String, nil] The matched string, or nil if nothing matched.
def accept(pattern)
  @scanner.scan(pattern)
end
|
|
|
|
# Try to match `pattern`, returning the string if it matched or raising an Error if it didn't.
#
# @param pattern [Regexp, String] The pattern to match.
# @return [String] The matched string
# @raise [Error] If `pattern` doesn't match at the current scanner position.
def expect(pattern)
  str = accept(pattern)
  return str unless str.nil?

  # Nothing matched, so there is no matched text to report. Show what the
  # scanner is actually looking at instead.
  found = @scanner.eos? ? "end of input" : @scanner.rest.inspect
  raise Error, "Expected #{pattern.inspect}; got #{found}"
end
|
|
|
|
# Run the block as a speculative parse. If it raises an Error, the scanner
# position and the open-paren count are rewound to where they were before the
# block ran, and the Error is re-raised for the caller to handle.
#
# At end of input an Error is raised immediately, without calling the block.
def backtrack(&block)
  checkpoint_pos = @scanner.pos
  checkpoint_parens = @unclosed_parens

  raise Error if @scanner.eos?

  yield
rescue Error
  @unclosed_parens = checkpoint_parens
  @scanner.pos = checkpoint_pos
  raise
end
|
|
|
|
# Apply the block repeatedly, collecting each result, until an attempt raises
# an Error. Every attempt runs inside `backtrack`.
#
# @return [Array] The results of the successful attempts (possibly empty).
def zero_or_more(&block)
  results = []

  begin
    loop { results << backtrack(&block) }
  rescue Error
    # A failed attempt just ends the repetition.
  end

  results
end
|
|
|
|
# Apply the block once unconditionally, then as many more times as it keeps
# succeeding. An Error from the first application is not rescued here, so it
# propagates to the caller.
#
# @return [Array] The results, first match first (never empty).
def one_or_more(&block)
  # The left operand is evaluated first, so the mandatory match runs before
  # the optional repetitions.
  [yield] + zero_or_more(&block)
end
|
|
|
|
# Construct an AST node.
#
# @param type [Symbol] The node type; this file uses :all, :none, :and, :or, :not, :opt, :tag, :wildcard and :metatag.
# @param args [Array] The node's children or values, passed on as a single array.
# @return [AST] The new node.
def node(type, *args)
  AST.new(type, args)
end
|
|
end
|
|
end
|
|
end
|