Skip to content

Commit

Permalink
Merge branch 'branch-2.0' into branch-2.0-fix-publish-failed-return-ok
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaokang authored Dec 22, 2023
2 parents 5016bab + 69357f3 commit df3b4cb
Show file tree
Hide file tree
Showing 53 changed files with 1,308 additions and 249 deletions.
4 changes: 4 additions & 0 deletions be/src/exec/olap_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,10 @@ class ColumnValueRange {
condition.__set_condition_op("match_all");
} else if (value.first == MatchType::MATCH_PHRASE) {
condition.__set_condition_op("match_phrase");
} else if (value.first == MatchType::MATCH_PHRASE_PREFIX) {
condition.__set_condition_op("match_phrase_prefix");
} else if (value.first == MatchType::MATCH_REGEXP) {
condition.__set_condition_op("match_regexp");
} else if (value.first == MatchType::MATCH_ELEMENT_EQ) {
condition.__set_condition_op("match_element_eq");
} else if (value.first == MatchType::MATCH_ELEMENT_LT) {
Expand Down
17 changes: 16 additions & 1 deletion be/src/exec/olap_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ enum class MatchType {
MATCH_ELEMENT_GT = 5,
MATCH_ELEMENT_LE = 6,
MATCH_ELEMENT_GE = 7,
MATCH_PHRASE_PREFIX = 8,
MATCH_REGEXP = 9,
};

inline MatchType to_match_type(TExprOpcode::type type) {
Expand All @@ -183,6 +185,12 @@ inline MatchType to_match_type(TExprOpcode::type type) {
case TExprOpcode::type::MATCH_PHRASE:
return MatchType::MATCH_PHRASE;
break;
case TExprOpcode::type::MATCH_PHRASE_PREFIX:
return MatchType::MATCH_PHRASE_PREFIX;
break;
case TExprOpcode::type::MATCH_REGEXP:
return MatchType::MATCH_REGEXP;
break;
case TExprOpcode::type::MATCH_ELEMENT_EQ:
return MatchType::MATCH_ELEMENT_EQ;
break;
Expand Down Expand Up @@ -212,6 +220,10 @@ inline MatchType to_match_type(const std::string& condition_op) {
return MatchType::MATCH_ALL;
} else if (condition_op.compare("match_phrase") == 0) {
return MatchType::MATCH_PHRASE;
} else if (condition_op.compare("match_phrase_prefix") == 0) {
return MatchType::MATCH_PHRASE_PREFIX;
} else if (condition_op.compare("match_regexp") == 0) {
return MatchType::MATCH_REGEXP;
} else if (condition_op.compare("match_element_eq") == 0) {
return MatchType::MATCH_ELEMENT_EQ;
} else if (condition_op.compare("match_element_lt") == 0) {
Expand All @@ -229,6 +241,8 @@ inline MatchType to_match_type(const std::string& condition_op) {
inline bool is_match_condition(const std::string& op) {
if (0 == strcasecmp(op.c_str(), "match_any") || 0 == strcasecmp(op.c_str(), "match_all") ||
0 == strcasecmp(op.c_str(), "match_phrase") ||
0 == strcasecmp(op.c_str(), "match_phrase_prefix") ||
0 == strcasecmp(op.c_str(), "match_regexp") ||
0 == strcasecmp(op.c_str(), "match_element_eq") ||
0 == strcasecmp(op.c_str(), "match_element_lt") ||
0 == strcasecmp(op.c_str(), "match_element_gt") ||
Expand All @@ -241,7 +255,8 @@ inline bool is_match_condition(const std::string& op) {

// Returns true when `op_type` is one of the MATCH_* expression opcodes
// (any/all/phrase/phrase-prefix/regexp and the element comparison variants),
// false for every other opcode.
// Each opcode is listed exactly once; the previous boolean chain tested
// MATCH_PHRASE and MATCH_ELEMENT_EQ twice.
inline bool is_match_operator(const TExprOpcode::type& op_type) {
    switch (op_type) {
    case TExprOpcode::MATCH_ANY:
    case TExprOpcode::MATCH_ALL:
    case TExprOpcode::MATCH_PHRASE:
    case TExprOpcode::MATCH_PHRASE_PREFIX:
    case TExprOpcode::MATCH_REGEXP:
    case TExprOpcode::MATCH_ELEMENT_EQ:
    case TExprOpcode::MATCH_ELEMENT_LT:
    case TExprOpcode::MATCH_ELEMENT_GT:
    case TExprOpcode::MATCH_ELEMENT_LE:
    case TExprOpcode::MATCH_ELEMENT_GE:
        return true;
    default:
        return false;
    }
}
Expand Down
8 changes: 7 additions & 1 deletion be/src/olap/match_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m
case MatchType::MATCH_PHRASE:
ret = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
break;
case MatchType::MATCH_PHRASE_PREFIX:
ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
break;
case MatchType::MATCH_REGEXP:
ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY;
break;
case MatchType::MATCH_ELEMENT_EQ:
ret = InvertedIndexQueryType::EQUAL_QUERY;
break;
Expand All @@ -117,7 +123,7 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m
}

bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const {
if (_match_type == MatchType::MATCH_PHRASE &&
if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) &&
iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT &&
get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) ==
INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ ConjunctionQuery::~ConjunctionQuery() {
}

void ConjunctionQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
if (terms.size() < 1) {
_CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1");
if (terms.empty()) {
_CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms empty");
}

std::vector<TermIterator> iterators;
for (auto& term : terms) {
for (const auto& term : terms) {
std::wstring ws_term = StringUtil::string_to_wstring(term);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,25 @@ namespace doris {
// Remembers the IndexReader that add() later uses to open TermDocs for each
// term. The destructor of this class does not release `reader`.
DisjunctionQuery::DisjunctionQuery(IndexReader* reader) : _reader(reader) {}

// Releases the CLucene objects accumulated by add().
// The TermDocs are released first and the Terms they were opened from
// afterwards; each container is walked exactly once (the previous text walked
// `_terms` both before and after `_term_docs`).
DisjunctionQuery::~DisjunctionQuery() {
    for (auto& term_doc : _term_docs) {
        if (term_doc) {
            _CLDELETE(term_doc);
        }
    }
    for (auto& term : _terms) {
        if (term) {
            _CLDELETE(term);
        }
    }
}

void DisjunctionQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
if (terms.size() < 1) {
_CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1");
if (terms.empty()) {
_CLTHROWA(CL_ERR_IllegalArgument, "DisjunctionQuery::add: terms empty");
}

for (auto& term : terms) {
for (const auto& term : terms) {
std::wstring ws_term = StringUtil::string_to_wstring(term);
_wsterms.emplace_back(&ws_term);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
TermDocs* term_doc = _reader->termDocs(t);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ class DisjunctionQuery {

private:
IndexReader* _reader = nullptr;
std::vector<std::wstring*> _wsterms;
std::vector<Term*> _terms;
std::vector<TermDocs*> _term_docs;
std::vector<TermIterator> _term_iterators;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "phrase_prefix_query.h"

#include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h"

namespace doris {

namespace segment_v2 {

// Stores a shared reference to the IndexSearcher; add() reads terms through
// its reader and search() executes the query on it.
PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher)
: _searcher(searcher) {}

void PhrasePrefixQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
if (terms.empty()) {
return;
}

for (size_t i = 0; i < terms.size(); i++) {
if (i < terms.size() - 1) {
std::wstring ws = StringUtil::string_to_wstring(terms[i]);
Term* t = _CLNEW Term(field_name.c_str(), ws.c_str());
_query.add(t);
_CLDECDELETE(t);
} else {
std::vector<CL_NS(index)::Term*> prefix_terms;
PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, terms[i],
prefix_terms, _max_expansions);
if (prefix_terms.empty()) {
continue;
}
_query.add(prefix_terms);
for (auto& t : prefix_terms) {
_CLDECDELETE(t);
}
}
}
}

// Executes the query built by add() and records every reported document id
// in `roaring`. The score passed to the callback is ignored.
void PhrasePrefixQuery::search(roaring::Roaring& roaring) {
    auto collect_doc = [&roaring](const int32_t docid, const float_t /*score*/) {
        roaring.add(docid);
    };
    _searcher->_search(&_query, collect_doc);
}

} // namespace segment_v2

} // namespace doris
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <CLucene.h>
#include <CLucene/index/IndexReader.h>

#include <memory>

#include "CLucene/search/MultiPhraseQuery.h"
#include "roaring/roaring.hh"

CL_NS_USE(index)
CL_NS_USE(search)

namespace doris {

namespace segment_v2 {

// Phrase query whose last term is matched as a prefix.
// Usage: construct with a searcher, call add() with the analyzed terms, then
// call search() to collect the matching document ids.
class PhrasePrefixQuery {
public:
// Keeps a shared reference to `searcher` for term expansion and execution.
PhrasePrefixQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher);
~PhrasePrefixQuery() = default;

// Overrides the limit handed to the prefix expansion in add() (default 50).
void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; }

// Adds `terms` for `field_name`: all but the last as exact phrase positions,
// the last one expanded as a prefix. Does nothing when `terms` is empty.
void add(const std::wstring& field_name, const std::vector<std::string>& terms);
// Runs the query and adds every matching doc id to `roaring`.
void search(roaring::Roaring& roaring);

private:
// Searcher used to read index terms and to execute `_query`.
std::shared_ptr<lucene::search::IndexSearcher> _searcher;
// Underlying CLucene multi-phrase query populated by add().
MultiPhraseQuery _query;

// Limit passed to PrefixQuery::get_prefix_terms for the trailing term.
int32_t _max_expansions = 50;
};

} // namespace segment_v2

} // namespace doris
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "prefix_query.h"

namespace doris {

// Appends to `prefix_terms` the terms of `field_name` whose text starts with
// `prefix`, walking the reader's term enumeration from `reader->terms()`
// positioned with a term built from the prefix.
//
// The walk stops at the first term that is null, belongs to another field, is
// shorter than the prefix, or does not start with it. When `max_expansions`
// is positive, at most that many terms are appended; a value <= 0 disables
// the limit.
//
// Appended terms are allocated with _CLNEW; the caller is responsible for
// releasing them (PhrasePrefixQuery::add does so with _CLDECDELETE).
void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
                                   const std::string& prefix,
                                   std::vector<CL_NS(index)::Term*>& prefix_terms,
                                   int32_t max_expansions) {
    std::wstring ws_prefix = StringUtil::string_to_wstring(prefix);

    Term* prefix_term = _CLNEW Term(field_name.c_str(), ws_prefix.c_str());
    TermEnum* enumerator = nullptr;
    Term* lastTerm = nullptr;
    int32_t count = 0;
    try {
        // Acquired inside the try block so `prefix_term` is still released if
        // opening the enumeration throws.
        enumerator = reader->terms(prefix_term);

        const TCHAR* prefixText = prefix_term->text();
        const TCHAR* prefixField = prefix_term->field();
        const size_t prefixLen = prefix_term->textLength();
        do {
            lastTerm = enumerator->term();
            // Field names are compared by pointer, as in the original code.
            // NOTE(review): presumably relies on CLucene interning field names -- confirm.
            if (lastTerm == nullptr || lastTerm->field() != prefixField) {
                break;
            }
            if (prefixLen > lastTerm->textLength()) {
                break;
            }

            // Compare back to front, same order as before, using an explicit
            // countdown instead of relying on size_t wrapping around to -1.
            const TCHAR* termText = lastTerm->text();
            bool has_prefix = true;
            for (size_t k = prefixLen; k-- > 0;) {
                if (termText[k] != prefixText[k]) {
                    has_prefix = false;
                    break;
                }
            }
            if (!has_prefix) {
                break;
            }

            if (max_expansions > 0 && count >= max_expansions) {
                break;
            }

            Term* t = _CLNEW Term(field_name.c_str(), termText);
            prefix_terms.push_back(t);
            count++;

            // Release the enumerator's term before advancing; on the break
            // paths above the cleanup below releases it instead.
            _CLDECDELETE(lastTerm);
        } while (enumerator->next());
    }
    _CLFINALLY({
        if (enumerator != nullptr) {
            enumerator->close();
            _CLDELETE(enumerator);
        }
        _CLDECDELETE(lastTerm);
        _CLDECDELETE(prefix_term);
    });
}

} // namespace doris
40 changes: 40 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <CLucene.h>
#include <CLucene/index/IndexReader.h>

#include <cstdint>

CL_NS_USE(index)

namespace doris {

// Helper for expanding a prefix into the concrete terms present in an index.
class PrefixQuery {
public:
PrefixQuery() = default;
~PrefixQuery() = default;

// Appends to `prefix_terms` the terms of `field_name` that start with
// `prefix`, read from `reader`. `max_expansions` (default 50) bounds how many
// terms are appended when it is positive. The appended Term objects are newly
// allocated; callers release them when done.
static void get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
const std::string& prefix,
std::vector<CL_NS(index)::Term*>& prefix_terms,
int32_t max_expansions = 50);
};

} // namespace doris
Loading

0 comments on commit df3b4cb

Please sign in to comment.