Files
danbooru/test/unit/post_query_parser_test.rb
evazion 4c7cfc73c6 search: add new tag search parser.
Add a new tag search parser that supports full boolean expressions, including `and`,
`or`, and `not` operators and parenthesized subexpressions.

This is only the parser itself, not the code for converting the search into SQL. The new
parser isn't used yet for actual searches. Searches still use the old parser.

Some example syntax:

* `1girl 1boy`
* `1girl and 1boy` (same as `1girl 1boy`)
* `1girl or 1boy`
* `~1girl ~1boy` (same as `1girl or 1boy`)
* `1girl and ((blonde_hair blue_eyes) or (red_hair green_eyes))`
* `1girl ~(blonde_hair blue_eyes) ~(red_hair green_eyes)` (same as above)
* `1girl -(blonde_hair blue_eyes)`
* `*_hair *_eyes`
* `*_hair or *_eyes`
* `user:evazion or fav:evazion`
* `~user:evazion ~fav:evazion`

Rules:

AND is implicit between terms, but may be written explicitly:

* `a b c` is `a and b and c`

AND has higher precedence (binds tighter) than OR:

* `a or b and c or d` is `a or (b and c) or d`
* `a or b c or d e` is `a or (b and c) or (d and e)`

All `~` operators in the same subexpression are combined into a single OR:

* `a b ~c ~d` is `a b (c or d)`
* `~a ~b and ~c ~d` is `(a or b) (c or d)`
* `(~a ~b) (~c ~d)` is `(a or b) (c or d)`

A single `~` operator in a subexpression by itself is ignored:

* `a ~b` is `a b`
* `~a and ~b` is `a and b`, which is `a b`
* `(~a) ~b` is `a ~b`, which is `a b`

The parser is written as a backtracking recursive descent parser built on top of
StringScanner and a handful of parser combinators. The parser generates an AST, which is
then simplified using Boolean algebra to remove redundant nodes and to convert the
expression to conjunctive normal form (that is, a product of sums, or an AND of ORs).
2022-03-29 18:21:46 -05:00

298 lines
12 KiB
Ruby

require 'test_helper'
# Unit tests for PostQuery::Parser. Each assertion parses a search string,
# simplifies the resulting expression, and compares its s-expression rendering
# against the expected string.
class PostQueryParserTest < ActiveSupport::TestCase
  # Asserts that `input`, once parsed and simplified, renders as `expected`.
  #
  # @param expected [String] the expected s-expression form
  # @param input [String] the raw search string handed to the parser
  def assert_parse_equals(expected, input)
    actual = PostQuery::Parser.parse(input).simplify.to_sexp
    assert_equal(expected, actual)
  end

  context "PostQueryParser:" do
    # Blank input is rendered as `all`.
    should "parse empty queries correctly" do
      assert_parse_equals("all", "")
      assert_parse_equals("all", " ")
    end

    # Tag names are lowercased; parentheses that belong to a tag name are kept,
    # while a wrapping pair of grouping parentheses is dropped.
    should "parse basic tags correctly" do
      assert_parse_equals("a", "a")
      assert_parse_equals("a", "A")

      assert_parse_equals("foo_(bar)", "foo_(bar)")
      assert_parse_equals("foo_(bar)", "(foo_(bar))")

      assert_parse_equals("foo_(bar_(baz))", "foo_(bar_(baz))")
      assert_parse_equals("foo_(bar_(baz))", "(foo_(bar_(baz)))")

      assert_parse_equals(";)", ";)")
      assert_parse_equals("9", "(9)")
    end

    should "parse basic queries correctly" do
      assert_parse_equals("(and a b)", "a b")
      assert_parse_equals("(or a b)", "a or b")
      assert_parse_equals("(or a b)", "~a ~b")

      assert_parse_equals("(not a)", "-a")
      assert_parse_equals("(and (not b) a)", "a -b")

      assert_parse_equals("fav:a", "fav:a")
      assert_parse_equals("(not fav:a)", "-fav:a")

      assert_parse_equals("(and fav:a fav:b)", "fav:a fav:b")
    end

    # Metatag names are matched case-insensitively while their values keep
    # their case; quoted values may contain spaces, parentheses and escaped
    # quotes.
    should "parse metatags correctly" do
      assert_parse_equals("fav:a", "fav:a")
      assert_parse_equals("user:a", "user:a")
      assert_parse_equals("pool:a", "pool:a")
      assert_parse_equals("order:a", "order:a")
      assert_parse_equals("source:a", "source:a")

      assert_parse_equals("fav:a", "FAV:a")
      assert_parse_equals("fav:A", "fav:A")

      assert_parse_equals("fav:a", "~fav:a")
      assert_parse_equals("(not fav:a)", "-fav:a")

      assert_parse_equals("(and fav:a fav:b)", "fav:a fav:b")
      assert_parse_equals("(or fav:a fav:b)", "~fav:a ~fav:b")
      assert_parse_equals("(or fav:a fav:b)", "fav:a or fav:b")

      assert_parse_equals("fav:a", "(fav:a)")
      assert_parse_equals("fav:(a)", "fav:(a)")
      assert_parse_equals("fav:(a", "(fav:(a)")

      assert_parse_equals('source:foo bar', 'source:"foo bar"')
      assert_parse_equals('source:foobar"(', 'source:foobar"(')
      assert_parse_equals('source:', 'source:""')
      assert_parse_equals(%q{source:don't say "lazy" okay}, %q{source:"don't say \"lazy\" okay"})
      assert_parse_equals(%q{(and source:foo)bar a)}, %q{(a (source:"foo)bar"))})
    end

    # Any tag containing `*` is rendered as a `(wildcard ...)` node.
    should "parse wildcard tags correctly" do
      # A wildcard on its own.
      assert_parse_equals("(wildcard *)", "*")
      assert_parse_equals("(wildcard *a)", "*a")
      assert_parse_equals("(wildcard a*)", "a*")
      assert_parse_equals("(wildcard *a*)", "*a*")
      assert_parse_equals("(wildcard a*b)", "a*b")

      # A wildcard followed by a plain tag.
      assert_parse_equals("(and b (wildcard *))", "* b")
      assert_parse_equals("(and b (wildcard *a))", "*a b")
      assert_parse_equals("(and b (wildcard a*))", "a* b")
      assert_parse_equals("(and b (wildcard *a*))", "*a* b")

      # A plain tag followed by a wildcard.
      assert_parse_equals("(and a (wildcard *))", "a *")
      assert_parse_equals("(and a (wildcard *b))", "a *b")
      assert_parse_equals("(and a (wildcard b*))", "a b*")
      assert_parse_equals("(and a (wildcard *b*))", "a *b*")

      # Negated wildcards.
      assert_parse_equals("(and (not (wildcard *)) a)", "a -*")
      assert_parse_equals("(and (not (wildcard b*)) a)", "a -b*")
      assert_parse_equals("(and (not (wildcard *b)) a)", "a -*b")
      assert_parse_equals("(and (not (wildcard *b*)) a)", "a -*b*")

      # Wildcards under the `~` operator.
      assert_parse_equals("(or a (wildcard *))", "~a ~*")
      assert_parse_equals("(or a (wildcard *))", "~* ~a")
      assert_parse_equals("(or a (wildcard *a))", "~a ~*a")
      assert_parse_equals("(or a (wildcard *a))", "~*a ~a")

      # Wildcards with explicit `and` / `or`.
      assert_parse_equals("(or a (wildcard a*))", "a or a*")
      assert_parse_equals("(and a (wildcard a*))", "a and a*")

      assert_parse_equals("(and (wildcard a*) (wildcard b*))", "a* b*")
      assert_parse_equals("(or (wildcard a*) (wildcard b*))", "a* or b*")

      assert_parse_equals("(and a c (wildcard b*))", "a b* c")
      assert_parse_equals("(and (not (wildcard *)) a c)", "a -* c")
    end

    # Surrounding whitespace and redundant grouping parentheses are ignored.
    should "parse single tag queries correctly" do
      assert_parse_equals("a", "a")
      assert_parse_equals("a", "a ")
      assert_parse_equals("a", " a")
      assert_parse_equals("a", " a ")
      assert_parse_equals("a", "(a)")
      assert_parse_equals("a", "( a)")
      assert_parse_equals("a", "(a )")
      assert_parse_equals("a", " ( a ) ")
      assert_parse_equals("a", "((a))")
      assert_parse_equals("a", "( ( a ) )")
      assert_parse_equals("a", " ( ( a ) ) ")
    end

    # Nested ANDs, implicit or explicit, are flattened into a single `and` node.
    should "parse nested AND queries correctly" do
      # Implicit AND.
      assert_parse_equals("(and a b)", "a b")
      assert_parse_equals("(and a b)", "(a b)")
      assert_parse_equals("(and a b)", "a (b)")
      assert_parse_equals("(and a b)", "(a) b")
      assert_parse_equals("(and a b)", "(a) (b)")
      assert_parse_equals("(and a b)", "((a) (b))")

      assert_parse_equals("(and a b c)", "a b c")
      assert_parse_equals("(and a b c)", "(a b) c")
      assert_parse_equals("(and a b c)", "((a) b) c")
      assert_parse_equals("(and a b c)", "(((a) b) c)")
      assert_parse_equals("(and a b c)", "((a b) c)")
      assert_parse_equals("(and a b c)", "((a) (b) (c))")

      assert_parse_equals("(and a b c)", "a (b c)")
      assert_parse_equals("(and a b c)", "a (b (c))")
      assert_parse_equals("(and a b c)", "(a (b (c)))")
      assert_parse_equals("(and a b c)", "(a (b c))")
      assert_parse_equals("(and a b c)", "(a b c)")

      # Explicit `and` keyword (case-insensitive).
      assert_parse_equals("(and a b)", "a and b")
      assert_parse_equals("(and a b)", "a AND b")
      assert_parse_equals("(and a b)", "(a and b)")
      assert_parse_equals("(and a b c)", "a and b and c")
      assert_parse_equals("(and a b c)", "(a and b) and c")
      assert_parse_equals("(and a b c)", "a and (b and c)")
      assert_parse_equals("(and a b c)", "(a and b and c)")
    end

    # Nested ORs are flattened into a single `or` node.
    should "parse nested OR queries correctly" do
      assert_parse_equals("(or a b)", "a or b")
      assert_parse_equals("(or a b)", "a OR b")
      assert_parse_equals("(or a b)", "(a or b)")
      assert_parse_equals("(or a b)", "(a) or (b)")

      assert_parse_equals("(or a b c)", "a or b or c")
      assert_parse_equals("(or a b c)", "(a or b) or c")
      assert_parse_equals("(or a b c)", "a or (b or c)")
      assert_parse_equals("(or a b c)", "(a or b or c)")

      assert_parse_equals("(or a b c d)", "a or (b or (c or d))")
      assert_parse_equals("(or a b c d)", "((a or b) or c) or d")
      assert_parse_equals("(or a b c d)", "(a or b) or (c or d)")
    end

    # All `~` terms in one subexpression are combined into a single OR; a lone
    # `~` term in a subexpression behaves like a plain term.
    should "parse the '~' operator correctly" do
      assert_parse_equals("(or a b)", "~a ~b")
      assert_parse_equals("(or a b c)", "~a ~b ~c")
      assert_parse_equals("(or a b c d)", "~a ~b ~c ~d")

      # A single `~`, possibly nested or mixed with `-`.
      assert_parse_equals("a", "~a")
      assert_parse_equals("a", "(~a)")
      assert_parse_equals("a", "~(a)")
      assert_parse_equals("a", "~(~a)")
      assert_parse_equals("a", "~(~(~a))")

      assert_parse_equals("(not a)", "~(-a)")
      assert_parse_equals("(not a)", "-(~a)")
      assert_parse_equals("a", "-(~(-(~a)))")
      assert_parse_equals("a", "~(-(~(-a)))")

      assert_parse_equals("(and a b)", "a ~b")
      assert_parse_equals("(and a b)", "~a b")
      assert_parse_equals("(and a b)", "((a) ~b)")
      assert_parse_equals("(and a b)", "~(a b)")

      assert_parse_equals("(and a b)", "~a and ~b")
      assert_parse_equals("(or a b)", "~a or ~b")
      assert_parse_equals("(or (not a) (not b))", "~(-a) or ~(-b)")

      # `~` interacting with grouping parentheses.
      assert_parse_equals("(or a b)", "~(a) ~(b)")
      assert_parse_equals("(and a b)", "(~a) (~b)")

      assert_parse_equals("(and (or b c) a)", "(~a) ~b ~c")
      assert_parse_equals("(and (or b c) a)", "~a (~b ~c)")

      assert_parse_equals("(or a b c d)", "~a ~b or ~c ~d")
      assert_parse_equals("(and (or a b) (or c d))", "~a ~b and ~c ~d")
      assert_parse_equals("(and (or a b) (or c d))", "(~a ~b) (~c ~d)")

      assert_parse_equals("(and (or a c) (or a d) (or b c) (or b d))", "~(a b) ~(c d)")
      assert_parse_equals("(and (or a c) (or a d) (or b c) (or b d))", "(a b) or (c d)")

      # Every combination of `~` over four terms.
      assert_parse_equals("(and a b c d)", " a b c d")
      assert_parse_equals("(and a b c d)", " a b c ~d")
      assert_parse_equals("(and a b c d)", " a b ~c d")
      assert_parse_equals("(and (or c d) a b)", " a b ~c ~d")
      assert_parse_equals("(and a b c d)", " a ~b c d")
      assert_parse_equals("(and (or b d) a c)", " a ~b c ~d")
      assert_parse_equals("(and (or b c) a d)", " a ~b ~c d")
      assert_parse_equals("(and (or b c d) a)", " a ~b ~c ~d")
      assert_parse_equals("(and a b c d)", "~a b c d")
      assert_parse_equals("(and (or a d) b c)", "~a b c ~d")
      assert_parse_equals("(and (or a c) b d)", "~a b ~c d")
      assert_parse_equals("(and (or a c d) b)", "~a b ~c ~d")
      assert_parse_equals("(and (or a b) c d)", "~a ~b c d")
      assert_parse_equals("(and (or a b d) c)", "~a ~b c ~d")
      assert_parse_equals("(and (or a b c) d)", "~a ~b ~c d")
      assert_parse_equals("(or a b c d)", "~a ~b ~c ~d")
    end

    should "parse NOT queries correctly" do
      assert_parse_equals("(not a)", "-a")

      assert_parse_equals("(and (not b) a)", "(a -b)")
      assert_parse_equals("(and (not b) a)", "a (-b)")
      assert_parse_equals("(and (not b) a)", "((a) -b)")
    end

    # An even number of stacked negations cancels out; an odd number leaves one.
    should "eliminate double negations" do
      assert_parse_equals("(not a)", "-a")
      assert_parse_equals("(not a)", "-(-(-a))")

      assert_parse_equals("a", "-(-a)")
      assert_parse_equals("a", "-(-(-(-a)))")

      assert_parse_equals("(and a b c)", "a -(-(b)) c")
      assert_parse_equals("(and a b c d)", "a -(-(b -(-c))) d")
    end

    # Negation is pushed inward: NOT over AND becomes OR of NOTs, and vice versa.
    should "apply DeMorgan's law" do
      assert_parse_equals("(or (not a) (not b))", "-(a b)")
      assert_parse_equals("(and (not a) (not b))", "-(a or b)")

      assert_parse_equals("(or (not a) (not b) (not c))", "-(a b c)")
      assert_parse_equals("(and (not a) (not b) (not c))", "-(a or b or c)")

      assert_parse_equals("(or a b c)", "-(-a -b -c)")
      assert_parse_equals("(and a b c)", "-(-a or -b or -c)")

      assert_parse_equals("(and (or (not a) (not c) (not d)) (or (not a) b))", "-(a -(b -(c d)))")
    end

    # OR is distributed over AND so the result is an AND of ORs.
    should "apply the distributive law" do
      assert_parse_equals("(and (or a b) (or a c))", "a or (b c)")
      assert_parse_equals("(and (or a b) (or a c))", "(b c) or a")

      assert_parse_equals("(and (or a c) (or a d) (or b c) (or b d))", "(a b) or (c d)")
      assert_parse_equals("(and (or a c e) (or a c f) (or a d e) (or a d f) (or b c e) (or b c f) (or b d e) (or b d f))", "(a b) or (c d) or (e f)")
    end

    # Malformed input is expected to render as `none` instead of raising.
    should "return the empty search for syntax errors" do
      # Dangling or unbalanced punctuation.
      assert_parse_equals("none", "(")
      assert_parse_equals("none", ")")
      assert_parse_equals("none", "-")
      assert_parse_equals("none", "~")

      assert_parse_equals("none", "(a")
      assert_parse_equals("none", ")a")

      # Stacked prefix operators.
      assert_parse_equals("none", "-~a")
      assert_parse_equals("none", "~-a")
      assert_parse_equals("none", "~~a")
      assert_parse_equals("none", "--a")

      # `and` / `or` keywords without both operands.
      assert_parse_equals("none", "and")
      assert_parse_equals("none", "-and")
      assert_parse_equals("none", "~and")
      assert_parse_equals("none", "or")
      assert_parse_equals("none", "-or")
      assert_parse_equals("none", "~or")

      assert_parse_equals("none", "a and")
      assert_parse_equals("none", "a or")
      assert_parse_equals("none", "and a")
      assert_parse_equals("none", "or a")

      assert_parse_equals("none", "a -")
      assert_parse_equals("none", "a ~")

      # Unclosed parentheses.
      assert_parse_equals("none", "(a b")
      assert_parse_equals("none", "(a (b)")

      # Unterminated quoted metatag values.
      assert_parse_equals("none", 'source:"foo')
      assert_parse_equals("none", 'source:"foo bar')
    end
  end
end