Skip to content

Commit

Permalink
[fix] (inverted index) Fix match function without inverted index (#38989)
Browse files Browse the repository at this point in the history

## Proposed changes

### BUG

The properties of the inverted index were not passed from the FE to the
BE, resulting in inconsistencies between non-indexed and indexed match
queries.

### FIX
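Pass the inverted index parser lowercase and stopwords properties from the FE to the BE, so that the match function evaluated without an inverted index tokenizes text the same way as the indexed path.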
  • Loading branch information
csun5285 authored Aug 8, 2024
1 parent df55639 commit 3f74137
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 83 deletions.
3 changes: 3 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,9 @@ lucene::util::bkd::relation InvertedIndexVisitor<QT>::compare(std::vector<uint8_
Status InvertedIndexIterator::read_from_inverted_index(
const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type,
uint32_t segment_num_rows, std::shared_ptr<roaring::Roaring>& bit_map, bool skip_try) {
DBUG_EXECUTE_IF("return_inverted_index_bypass", {
return Status::Error<ErrorCode::INVERTED_INDEX_BYPASS>("inverted index bypass");
});
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
Expand Down
13 changes: 13 additions & 0 deletions be/src/vec/exprs/vmatch_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@

#include "vec/exprs/vmatch_predicate.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif

#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <fmt/format.h>
#include <fmt/ranges.h> // IWYU pragma: keep
#include <gen_cpp/Exprs_types.h>
Expand All @@ -29,6 +35,7 @@
#include <string_view>
#include <vector>

#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/status.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/core/block.h"
Expand All @@ -53,6 +60,12 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
_inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
_inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map;
_analyzer = InvertedIndexReader::create_analyzer(_inverted_index_ctx.get());
_analyzer->set_lowercase(node.match_predicate.parser_lowercase);
if (node.match_predicate.parser_stopwords == "none") {
_analyzer->set_stopwords(nullptr);
} else {
_analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
}
_inverted_index_ctx->analyzer = _analyzer.get();
}

Expand Down
110 changes: 48 additions & 62 deletions be/src/vec/functions/match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,29 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
} else if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_REGEXP_QUERY;
} else if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY;
}
return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
}

// Turns the MATCH query string into the list of tokens to search for.
//
// When the context's parser type is PARSER_NONE the query string is appended to
// `query_tokens` unchanged as a single token. Otherwise a reader is created over
// the query string and handed, together with the context's analyzer, the column
// name and the query type derived from this function's name, to
// InvertedIndexReader::get_analyse_result, which is expected to fill `query_tokens`.
//
// `inverted_index_ctx` is dereferenced unconditionally, so it must be non-null.
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
                                                InvertedIndexCtx* inverted_index_ctx,
                                                const std::string& match_query_str,
                                                const std::string& column_name) const {
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

    const bool needs_analysis =
            inverted_index_ctx->parser_type != InvertedIndexParserType::PARSER_NONE;
    if (needs_analysis) {
        auto query_reader = doris::segment_v2::InvertedIndexReader::create_reader(
                inverted_index_ctx, match_query_str);
        doris::segment_v2::InvertedIndexReader::get_analyse_result(
                *query_tokens, query_reader.get(), inverted_index_ctx->analyzer, column_name,
                get_query_type_from_fn_name());
    } else {
        // No parser configured: the whole query string is the only token.
        query_tokens->emplace_back(match_query_str);
    }
}

inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col, int32_t current_block_row_idx,
Expand All @@ -134,23 +153,31 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
const auto& str_ref = string_col->get_data_at(current_src_array_offset);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
continue;
}
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());

std::vector<std::string> element_tokens;

doris::segment_v2::InvertedIndexReader::get_analyse_result(
element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end());
}
} else {
const auto& str_ref = string_col->get_data_at(current_block_row_idx);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
str_ref.to_string());

doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, reader.get(),
inverted_index_ctx->analyzer,
column_name, query_type, false);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
} else {
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());
doris::segment_v2::InvertedIndexReader::get_analyse_result(
data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
}
}
return data_tokens;
}
Expand All @@ -177,23 +204,14 @@ Status FunctionMatchAny::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -224,23 +242,14 @@ Status FunctionMatchAll::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -277,23 +286,14 @@ Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -345,25 +345,14 @@ Status FunctionMatchPhrasePrefix::execute_match(
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);

auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);

analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -414,18 +403,15 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
<< inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

if (match_query_str.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down
6 changes: 6 additions & 0 deletions be/src/vec/functions/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const std::string MATCH_ALL_FUNCTION = "match_all";
// Function-name constants for the MATCH family. get_query_type_from_fn_name()
// compares the function name against constants from this list to pick the
// InvertedIndexQueryType.
const std::string MATCH_PHRASE_FUNCTION = "match_phrase";
const std::string MATCH_PHRASE_PREFIX_FUNCTION = "match_phrase_prefix";
// NOTE: despite "PHRASE" in the identifier, the value is "match_regexp" and it
// is mapped to MATCH_REGEXP_QUERY.
const std::string MATCH_PHRASE_REGEXP_FUNCTION = "match_regexp";
// Mapped to MATCH_PHRASE_EDGE_QUERY.
const std::string MATCH_PHRASE_EDGE_FUNCTION = "match_phrase_edge";

class FunctionMatchBase : public IFunction {
public:
Expand All @@ -81,6 +82,11 @@ class FunctionMatchBase : public IFunction {

doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name() const;

void analyse_query_str_token(std::vector<std::string>* query_tokens,
InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& field_name) const;

std::vector<std::string> analyse_data_token(const std::string& column_name,
InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,18 @@ public static Map<String, String> getInvertedIndexCharFilter(Map<String, String>
return charFilterMap;
}

/**
 * Reads the parser lowercase setting from a set of inverted index properties.
 *
 * @param properties index properties; may be null
 * @return the boolean parsed from the property value, or {@code true} when the
 *         map is null or holds no value for the key
 */
public static boolean getInvertedIndexParserLowercase(Map<String, String> properties) {
    if (properties == null) {
        return true;
    }
    String configured = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    // An unset property falls back to the default, which is true.
    return configured == null || Boolean.parseBoolean(configured);
}

/**
 * Reads the parser stopwords setting from a set of inverted index properties.
 *
 * @param properties index properties; may be null
 * @return the configured value, or the empty string when the map is null or
 *         holds no value for the key
 */
public static String getInvertedIndexParserStopwords(Map<String, String> properties) {
    if (properties == null) {
        return "";
    }
    String stopwords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
    // An unset property falls back to the default, which is "".
    return stopwords == null ? "" : stopwords;
}

public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType,
Map<String, String> properties) throws AnalysisException {
String parser = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ public static void initBuiltins(FunctionSet functionSet) {
// Inverted index parser settings. They are copied from an Index (in analyzeImpl
// and in the Nereids constructor) and sent to the BE by toThrift().
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
// Values used when no index supplies them: lowercase enabled, stopwords "".
private boolean invertedIndexParserLowercase = true;
private String invertedIndexParserStopwords = "";

private MatchPredicate() {
// use for serde only
Expand All @@ -178,23 +180,22 @@ protected MatchPredicate(MatchPredicate other) {
invertedIndexParser = other.invertedIndexParser;
invertedIndexParserMode = other.invertedIndexParserMode;
invertedIndexCharFilter = other.invertedIndexCharFilter;
invertedIndexParserLowercase = other.invertedIndexParserLowercase;
invertedIndexParserStopwords = other.invertedIndexParserStopwords;
}

/**
* use for Nereids ONLY
*/
public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode,
Map<String, String> invertedIndexCharFilter) {
NullableMode nullableMode, Index invertedIndex) {
this(op, e1, e2);
if (invertedIndexParser != null) {
this.invertedIndexParser = invertedIndexParser;
}
if (invertedIndexParserMode != null) {
this.invertedIndexParserMode = invertedIndexParserMode;
}
if (invertedIndexParserMode != null) {
this.invertedIndexCharFilter = invertedIndexCharFilter;
if (invertedIndex != null) {
this.invertedIndexParser = invertedIndex.getInvertedIndexParser();
this.invertedIndexParserMode = invertedIndex.getInvertedIndexParserMode();
this.invertedIndexCharFilter = invertedIndex.getInvertedIndexCharFilter();
this.invertedIndexParserLowercase = invertedIndex.getInvertedIndexParserLowercase();
this.invertedIndexParserStopwords = invertedIndex.getInvertedIndexParserStopwords();
}
fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType,
false, true, nullableMode);
Expand Down Expand Up @@ -228,6 +229,8 @@ protected void toThrift(TExprNode msg) {
msg.setOpcode(op.getOpcode());
msg.match_predicate = new TMatchPredicate(invertedIndexParser, invertedIndexParserMode);
msg.match_predicate.setCharFilterMap(invertedIndexCharFilter);
msg.match_predicate.setParserLowercase(invertedIndexParserLowercase);
msg.match_predicate.setParserStopwords(invertedIndexParserStopwords);
}

@Override
Expand Down Expand Up @@ -272,6 +275,8 @@ public void analyzeImpl(Analyzer analyzer) throws AnalysisException {
invertedIndexParser = index.getInvertedIndexParser();
invertedIndexParserMode = index.getInvertedIndexParserMode();
invertedIndexCharFilter = index.getInvertedIndexCharFilter();
invertedIndexParserLowercase = index.getInvertedIndexParserLowercase();
invertedIndexParserStopwords = index.getInvertedIndexParserStopwords();
break;
}
}
Expand Down
8 changes: 8 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ public Map<String, String> getInvertedIndexCharFilter() {
return InvertedIndexUtil.getInvertedIndexCharFilter(properties);
}

/**
 * Returns the parser lowercase setting of this index, as resolved by
 * {@link InvertedIndexUtil#getInvertedIndexParserLowercase} from this index's properties.
 */
public boolean getInvertedIndexParserLowercase() {
return InvertedIndexUtil.getInvertedIndexParserLowercase(properties);
}

/**
 * Returns the parser stopwords setting of this index, as resolved by
 * {@link InvertedIndexUtil#getInvertedIndexParserStopwords} from this index's properties.
 */
public String getInvertedIndexParserStopwords() {
return InvertedIndexUtil.getInvertedIndexParserStopwords(properties);
}

/**
 * Returns true exactly when this index's type is {@code IndexDef.IndexType.INVERTED}.
 */
public boolean isLightIndexChangeSupported() {
return indexType == IndexDef.IndexType.INVERTED;
}
Expand Down
Loading

0 comments on commit 3f74137

Please sign in to comment.