Skip to content

Commit

Permalink
query: move phrasification to mu-query-parser
Browse files Browse the repository at this point in the history
Do the "phrasification" for matching fields later during query parsing;
this allows for handling combination fields correctly.

Also match both the normal term and the "phrase term", so we catch more
cases. Update/extend unit tests.

This fixes the "kata-container" issue also for body test.

Fixes #2167.
  • Loading branch information
djcb committed Sep 17, 2023
1 parent 7cbab21 commit 5bda8c3
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 41 deletions.
65 changes: 59 additions & 6 deletions lib/mu-query-parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,42 @@ struct ParseContext {
};




/**
* Indexable fields become _phrase_ fields if they contain
* wordbreakable data;
*
* @param field
* @param val
*
* @return
*/
static Option<Sexp>
phrasify(const Field& field, const Sexp& val)
{
if (!field.is_phrasable_term() || !val.stringp())
return Nothing; // nothing to phrasify

auto words{utf8_wordbreak(val.string())};
if (words.find(' ') == std::string::npos)
return Nothing; // nothing to phrasify

auto phrase = Sexp {
Sexp::Symbol{field.name},
Sexp{phrase_sym, Sexp{std::move(words)}}};

// if the field both a normal term & phrasable, match both
// if they are different
if (val.string() != words)
return Sexp{or_sym,
Sexp {Sexp::Symbol{field.name}, Sexp(val.string())},
std::move(phrase)};
else
return phrase;
}


/*
* Grammar
*
Expand All @@ -87,6 +123,7 @@ struct ParseContext {

static Sexp query(Sexp& tokens, ParseContext& ctx);


static Sexp
matcher(Sexp& tokens, ParseContext& ctx)
{
Expand All @@ -95,22 +132,38 @@ matcher(Sexp& tokens, ParseContext& ctx)

auto val{*tokens.head()};
tokens.pop_front();

/* special case: if we find some non-matcher type here, we need to
* second-guess the tokenizer */
/* special case: if we find some non-matcher type here, we need to second-guess the token */
if (!looks_like_matcher(val))
val = Sexp{placeholder_sym, val.symbol().name};

const auto fieldsym{val.front().symbol()};

// Note the _expand_ case is what we use when processing the query 'for real';
// the non-expand case is only to have a bit more human-readable Sexp for use
// mu find's '--analyze'
//
// Re: phrase-fields We map something like 'subject:hello-world'
// to
// (or (subject "hello-world" (subject (phrase "hello world"))))

if (ctx.expand) { /* should we expand meta-fields? */
const auto symbol{val.front().symbol()};
const auto fields = fields_from_name(symbol == placeholder_sym ? "" : symbol.name);
auto fields = fields_from_name(fieldsym == placeholder_sym ? "" : fieldsym.name);
if (!fields.empty()) {
Sexp vals{};
vals.add(or_sym);
for (auto&& field: fields)
vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}});
if (auto&& phrase{phrasify(field, *second(val))}; phrase)
vals.add(std::move(*phrase));
else
vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}});
val = std::move(vals);
}

}

if (auto&& field{field_from_name(fieldsym.name)}; field) {
if (auto&& phrase(phrasify(*field, *second(val))); phrase)
val = std::move(*phrase);
}

return val;
Expand Down
29 changes: 1 addition & 28 deletions lib/mu-query-processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ struct Element {
ValueType value{};
};
struct Basic: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Phrase: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Regex: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Wildcard: public FieldValue<std::string> {using FieldValue::FieldValue;};
struct Range: public FieldValue<std::pair<std::string, std::string>> {
Expand All @@ -89,7 +88,6 @@ struct Element {
std::string,
/* value types */
Basic,
Phrase,
Regex,
Wildcard,
Range
Expand Down Expand Up @@ -152,9 +150,6 @@ struct Element {
}
} else if constexpr (std::is_same_v<T, Basic>) {
return Sexp { field_sym(arg.field), arg.value };
} else if constexpr (std::is_same_v<T, Phrase>) {
return Sexp {field_sym(arg.field),
Sexp{ phrase_sym, arg.value }};
} else if constexpr (std::is_same_v<T, Regex>) {
return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}};
} else if constexpr (std::is_same_v<T, Wildcard>) {
Expand Down Expand Up @@ -337,24 +332,6 @@ basify(Element&& element)
return element;
}

static Option<Element>
phrasify(Element&& element)
{
auto&& basic{element.get_opt<Element::Basic>()};
if (!basic)
return element;

auto&& field = field_from_name(*basic->field);
if (!field || field->is_indexable_term()) {
auto&& val{basic->value};
if (val.find(' ') != std::string::npos)
element.value = Element::Phrase{basic->field, val};
}

return element;
}


static Option<Element>
wildcardify(Element&& element)
{
Expand Down Expand Up @@ -467,7 +444,7 @@ process(const std::string& expr)
.and_then(opify)
.and_then(basify)
.and_then(regexpify)
.and_then(phrasify)
//.and_then(phrasify)
.and_then(wildcardify)
.and_then(rangify);
if (element)
Expand Down Expand Up @@ -527,10 +504,6 @@ test_processor()
std::vector<TestCase> cases = {
// basics
TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"},
TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"},
TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"},

// maildir must _not_ be phrasified
TestCase{R"(maildir:/"hello world")", R"(((maildir "/hello world")))"},
};

Expand Down
13 changes: 8 additions & 5 deletions lib/mu-query-xapianizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ string_nth(const Sexp& args, size_t n)
static Result<Xapian::Query>
phrase(const Field& field, Sexp&& s)
{
if (!field.is_indexable_term())
if (!field.is_phrasable_term())
return Err(Error::Code::InvalidArgument,
"field {} does not support phrases", field.name);

Expand Down Expand Up @@ -273,13 +273,12 @@ parse_basic(const Field &field, Sexp &&vals, Mu::ParserFlags flags)
default: {
auto q{Xapian::Query{field.xapian_term(val)}};
if (ngrams) { // special case: cjk; see if we can create an expanded query.
if (field.is_indexable_term() && contains_unbroken_script(val))
if (field.is_phrasable_term() && contains_unbroken_script(val))
if (auto&& ng{ngram_expand(field, val)}; ng)
return ng;
}
return q;
}}

}

static Result<Xapian::Query>
Expand Down Expand Up @@ -420,6 +419,8 @@ using TestCase = std::pair<std::string, std::string>;
static void
test_xapian()
{
allow_warnings();

auto&& testhome{unwrap(make_temp_dir())};
auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};
Expand All @@ -429,7 +430,8 @@ test_xapian()
auto&& zz{make_xapian_query(store, R"(subject:"hello world")")};
assert_valid_result(zz);
/* LCOV_EXCL_START*/
if (zz->get_description() != R"(Query((Shello PHRASE 2 Sworld)))") {
if (zz->get_description() != R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))") {
mu_println("{}", zz->get_description());
if (mu_test_mu_hacker()) {
// in the mu hacker case, we want to be warned if Xapian changed.
g_critical("xapian version mismatch");
Expand All @@ -446,7 +448,8 @@ test_xapian()
TestCase{R"(i:[email protected])", R"(Query([email protected]))"},
TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"},
TestCase{R"(subject:"hello world")",
R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))"},
TestCase{R"(subject:/boo/")", R"(Query())"},

// ranges.
Expand Down
10 changes: 8 additions & 2 deletions lib/tests/test-mu-store-query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ I said: "Aujourd'hui!"
}};
TempDir tdir;
auto store{make_test_store(tdir.path(), test_msgs, {})};
store.commit();

// matches
for (auto&& expr: {
Expand Down Expand Up @@ -692,17 +693,22 @@ Date: Wed, 26 Oct 2022 11:01:54 -0700
To: [email protected]
Subject: kata-containers
voodoo-containers
Boo!
)"},
}};

TempDir tdir;
auto store{make_test_store(tdir.path(), test_msgs, {})};
/* true: match; false: no match */
const auto cases = std::array<std::pair<const char*, bool>, 3>{{
const auto cases = std::vector<std::pair<const char*, bool>>{{
{"subject:kata", true},
{"subject:containers", true},
{"subject:kata-containers", true}
{"subject:kata-containers", true},
{"subject:\"kata containers\"", true},
{"voodoo-containers", true},
{"voodoo containers", true}
}};

for (auto&& test: cases) {
Expand Down

0 comments on commit 5bda8c3

Please sign in to comment.