Skip to content

Commit

Permalink
[Refactor](inverted index) add analyzer for inverted index to unify a…
Browse files Browse the repository at this point in the history
…nalysis process
  • Loading branch information
airborne12 committed Sep 12, 2024
1 parent 9caefba commit a0b94ba
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 231 deletions.
2 changes: 2 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct InvertedIndexCtx {
InvertedIndexParserType parser_type;
std::string parser_mode;
CharFilterMap char_filter_map;
std::string lower_case;
std::string stop_words;
lucene::analysis::Analyzer* analyzer = nullptr;
};

Expand Down
120 changes: 120 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"

#include "CLucene.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"

namespace doris::segment_v2::inverted_index {

// Builds the reader that feeds raw text into an analyzer.
//
// A plain SStringReader<char> is always created first. When `char_filter_map`
// is non-empty, ownership of that reader is released to
// CharFilterFactory::create together with the type / pattern / replacement
// entries of the map, and whatever the factory returns is handed back instead.
//
// The returned reader has not been given any text yet; the caller supplies the
// input afterwards.
//
// NOTE(review): the entries are read with operator[], which is why the map is
// taken by non-const reference. Presumably CharFilterMap is a std::map, in
// which case missing keys get default-inserted -- confirm.
std::unique_ptr<lucene::util::Reader> InvertedIndexAnalyzer::create_reader(
        CharFilterMap& char_filter_map) {
    std::unique_ptr<lucene::util::Reader> plain_reader =
            std::make_unique<lucene::util::SStringReader<char>>();

    // No char filter configured: the plain string reader is all that is needed.
    if (char_filter_map.empty()) {
        return plain_reader;
    }

    // Wrap the plain reader with the configured char filter.
    return std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
            char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], plain_reader.release(),
            char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
            char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
}

// Creates and configures the analyzer described by `inverted_index_ctx`.
//
// Analyzer selection, driven by `parser_type`:
//   - PARSER_STANDARD / PARSER_UNICODE -> standard95::StandardAnalyzer
//   - PARSER_CHINESE                   -> LanguageBasedAnalyzer("chinese") with its
//                                         dictionary loaded from
//                                         config::inverted_index_dict_path; mode is
//                                         Default for coarse granularity, All otherwise
//   - anything else (incl. PARSER_ENGLISH) -> SimpleAnalyzer<char>
//
// Post-configuration:
//   - `lower_case` equal to INVERTED_INDEX_PARSER_TRUE / INVERTED_INDEX_PARSER_FALSE
//     switches lowercasing on / off; any other value leaves the analyzer untouched.
//   - `stop_words` equal to "none" clears the stop-word set; any other value
//     installs standard95::stop_words.
//
// `inverted_index_ctx` must not be null; it is dereferenced unconditionally.
// The context is only read, so its string fields are bound by const reference
// instead of being copied on every call.
std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
        const InvertedIndexCtx* inverted_index_ctx) {
    std::unique_ptr<lucene::analysis::Analyzer> analyzer;
    const auto parser_type = inverted_index_ctx->parser_type;

    if (parser_type == InvertedIndexParserType::PARSER_STANDARD ||
        parser_type == InvertedIndexParserType::PARSER_UNICODE) {
        analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
    } else if (parser_type == InvertedIndexParserType::PARSER_CHINESE) {
        auto chinese_analyzer =
                std::make_unique<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
        chinese_analyzer->initDict(config::inverted_index_dict_path);
        const std::string& mode = inverted_index_ctx->parser_mode;
        chinese_analyzer->setMode(mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY
                                          ? lucene::analysis::AnalyzerMode::Default
                                          : lucene::analysis::AnalyzerMode::All);
        analyzer = std::move(chinese_analyzer);
    } else {
        // PARSER_ENGLISH and every remaining parser type use the simple analyzer.
        analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
    }

    // set lowercase; values other than true/false keep the analyzer's own setting
    const std::string& lowercase = inverted_index_ctx->lower_case;
    if (lowercase == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
    } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }

    // set stop words
    const std::string& stop_words = inverted_index_ctx->stop_words;
    if (stop_words == "none") {
        analyzer->set_stopwords(nullptr);
    } else {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
    }
    return analyzer;
}

void InvertedIndexAnalyzer::get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name,
InvertedIndexQueryType query_type,
bool drop_duplicates) {
analyse_result.clear();

std::wstring field_ws = StringUtil::string_to_wstring(field_name);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
analyzer->tokenStream(field_ws.c_str(), reader));

lucene::analysis::Token token;

while (token_stream->next(&token)) {
if (token.termLength<char>() != 0) {
analyse_result.emplace_back(token.termBuffer<char>(), token.termLength<char>());
}
}

if (token_stream != nullptr) {
token_stream->close();
}

if (drop_duplicates && (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY)) {
std::set<std::string> unrepeated_result(analyse_result.begin(), analyse_result.end());
analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end());
}
}

} // namespace doris::segment_v2::inverted_index
48 changes: 48 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>

#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index_query_type.h"

namespace lucene {
namespace util {
class Reader;
}
namespace analysis {
class Analyzer;
}
} // namespace lucene

namespace doris::segment_v2::inverted_index {
// Static helpers that build the reader / analyzer pair used for inverted index
// text analysis and run the analysis itself.
class InvertedIndexAnalyzer {
public:
// Returns a string reader, wrapped by a char filter when `char_filter_map`
// is non-empty. The reader is returned without any input text set.
static std::unique_ptr<lucene::util::Reader> create_reader(CharFilterMap& char_filter_map);

// Returns the analyzer selected by the context's parser type, with the
// context's parser mode, lowercase and stop-word settings applied.
static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
const InvertedIndexCtx* inverted_index_ctx);

// Clears `analyse_result` and fills it with the non-empty terms `analyzer`
// produces from `reader`. With `drop_duplicates` set, results for
// MATCH_ANY_QUERY / MATCH_ALL_QUERY are de-duplicated (and end up sorted).
static void get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name, InvertedIndexQueryType query_type,
bool drop_duplicates = true);
};
} // namespace doris::segment_v2::inverted_index
136 changes: 14 additions & 122 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,13 @@

#include "olap/rowset/segment_v2/inverted_index_reader.h"

#include <CLucene/analysis/AnalysisHeader.h>
#include <CLucene/analysis/Analyzers.h>
#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <CLucene/debug/error.h>
#include <CLucene/debug/mem.h>
#include <CLucene/index/Term.h>
#include <CLucene/search/IndexSearcher.h>
#include <CLucene/search/Query.h>
#include <CLucene/search/RangeQuery.h>
#include <CLucene/search/TermQuery.h>
#include <CLucene/store/Directory.h>
#include <CLucene/store/IndexInput.h>
#include <CLucene/util/CLStreams.h>
#include <CLucene/util/FutureArrays.h>
#include <CLucene/util/bkd/bkd_docid_iterator.h>
#include <CLucene/util/stringUtil.h>
Expand All @@ -40,26 +34,16 @@
#include <set>
#include <string>

#include "gutil/integral_types.h"
#include "inverted_index_query_type.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#include "common/config.h"
#include "common/logging.h"
#include "common/status.h"
#include "io/fs/file_system.h"
#include "gutil/integral_types.h"
#include "inverted_index_query_type.h"
#include "olap/inverted_index_parser.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/query_factory.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_file_reader.h"
Expand Down Expand Up @@ -114,83 +98,10 @@ CREATE_QUERY_VALUE_TEMPLATE(PrimitiveType::TYPE_STRING)
CREATE_QUERY_VALUE_TEMPLATE(PrimitiveType::TYPE_IPV4)
CREATE_QUERY_VALUE_TEMPLATE(PrimitiveType::TYPE_IPV6)

// Creates the analyzer matching the parser type stored in `inverted_index_ctx`:
// the standard analyzer for PARSER_STANDARD / PARSER_UNICODE, the dictionary
// backed Chinese analyzer for PARSER_CHINESE, and the simple analyzer for
// everything else (PARSER_ENGLISH included).
// `inverted_index_ctx` is dereferenced unconditionally and must not be null.
std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexReader::create_analyzer(
        InvertedIndexCtx* inverted_index_ctx) {
    const auto parser_type = inverted_index_ctx->parser_type;

    const bool wants_standard = parser_type == InvertedIndexParserType::PARSER_STANDARD ||
                                parser_type == InvertedIndexParserType::PARSER_UNICODE;
    if (wants_standard) {
        return std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
    }

    if (parser_type != InvertedIndexParserType::PARSER_CHINESE) {
        // PARSER_ENGLISH and the default case both map to the simple analyzer.
        return std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
    }

    auto chinese = std::make_unique<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
    chinese->initDict(config::inverted_index_dict_path);
    // Coarse granularity selects the Default mode; any other mode string selects All.
    const bool coarse =
            inverted_index_ctx->parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
    chinese->setMode(coarse ? lucene::analysis::AnalyzerMode::Default
                            : lucene::analysis::AnalyzerMode::All);

    std::unique_ptr<lucene::analysis::Analyzer> analyzer = std::move(chinese);
    return analyzer;
}

// Builds a reader over `value` for the analyzer.
//
// Starts from a plain SStringReader<char>; if the context carries a non-empty
// char filter map, the plain reader is released to CharFilterFactory::create
// (with the map's type / pattern / replacement entries) and replaced by the
// factory's result. The resulting reader is then initialised with `value`.
std::unique_ptr<lucene::util::Reader> InvertedIndexReader::create_reader(
        InvertedIndexCtx* inverted_index_ctx, const std::string& value) {
    std::unique_ptr<lucene::util::Reader> text_reader =
            std::make_unique<lucene::util::SStringReader<char>>();

    if (CharFilterMap& filters = inverted_index_ctx->char_filter_map; !filters.empty()) {
        text_reader = std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
                filters[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], text_reader.release(),
                filters[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
                filters[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
    }

    text_reader->init(value.data(), value.size(), true);
    return text_reader;
}

// Returns the index file path for this reader's index meta, as reported by the
// underlying inverted index file reader.
std::string InvertedIndexReader::get_index_file_path() {
return _inverted_index_file_reader->get_index_file_path(&_index_meta);
}

void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name,
InvertedIndexQueryType query_type,
bool drop_duplicates) {
analyse_result.clear();

std::wstring field_ws = StringUtil::string_to_wstring(field_name);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
analyzer->tokenStream(field_ws.c_str(), reader));

lucene::analysis::Token token;

while (token_stream->next(&token)) {
if (token.termLength<char>() != 0) {
analyse_result.emplace_back(token.termBuffer<char>(), token.termLength<char>());
}
}

if (token_stream != nullptr) {
token_stream->close();
}

if (drop_duplicates && (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY)) {
std::set<std::string> unrepeated_result(analyse_result.begin(), analyse_result.end());
analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end());
}
}

Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats,
InvertedIndexQueryCacheHandle* cache_handle,
lucene::store::Directory* dir) {
Expand Down Expand Up @@ -362,14 +273,17 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(_index_meta.properties())),
get_parser_mode_string_from_properties(_index_meta.properties()),
get_parser_char_filter_map_from_properties(_index_meta.properties()));
auto analyzer = create_analyzer(inverted_index_ctx.get());
setup_analyzer_lowercase(analyzer, _index_meta.properties());
setup_analyzer_use_stopwords(analyzer, _index_meta.properties());
get_parser_char_filter_map_from_properties(_index_meta.properties()),
get_parser_lowercase_from_properties(_index_meta.properties()),
get_parser_stopwords_from_properties(_index_meta.properties()));
auto analyzer = inverted_index::InvertedIndexAnalyzer::create_analyzer(
inverted_index_ctx.get());
inverted_index_ctx->analyzer = analyzer.get();
auto reader = create_reader(inverted_index_ctx.get(), search_str);
get_analyse_result(query_info.terms, reader.get(), analyzer.get(), column_name,
query_type);
auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
inverted_index_ctx->char_filter_map);
reader->init(search_str.data(), search_str.size(), true);
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
query_info.terms, reader.get(), analyzer.get(), column_name, query_type);
}
if (query_info.terms.empty()) {
auto msg = fmt::format(
Expand Down Expand Up @@ -433,28 +347,6 @@ InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}

// Applies the lowercase setting found in the index `properties` to `analyzer`.
// Only the explicit true / false values change the analyzer; any other value
// leaves its lowercase behaviour as it is.
void FullTextIndexReader::setup_analyzer_lowercase(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto lowercase_setting = get_parser_lowercase_from_properties(properties);

    if (lowercase_setting == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
        return;
    }
    if (lowercase_setting == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }
}

// Applies the stop-word setting found in the index `properties` to `analyzer`:
// the value "none" clears the stop-word set, every other value installs
// standard95::stop_words.
void FullTextIndexReader::setup_analyzer_use_stopwords(
        std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
        const std::map<string, string>& properties) {
    const auto stop_words_setting = get_parser_stopwords_from_properties(properties);
    const bool use_default_stop_words = !(stop_words_setting == "none");

    if (use_default_stop_words) {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
    } else {
        analyzer->set_stopwords(nullptr);
    }
}

Status StringTypeInvertedIndexReader::new_iterator(
OlapReaderStatistics* stats, RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
Expand Down
16 changes: 0 additions & 16 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,6 @@ class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexRea
[[nodiscard]] bool has_null() const { return _has_null; }
void set_has_null(bool has_null) { _has_null = has_null; }

static void get_analyse_result(std::vector<std::string>& analyse_result,
lucene::util::Reader* reader,
lucene::analysis::Analyzer* analyzer,
const std::string& field_name, InvertedIndexQueryType query_type,
bool drop_duplicates = true);

static std::unique_ptr<lucene::util::Reader> create_reader(InvertedIndexCtx* inverted_index_ctx,
const std::string& value);
static std::unique_ptr<lucene::analysis::Analyzer> create_analyzer(
InvertedIndexCtx* inverted_index_ctx);

virtual Status handle_query_cache(InvertedIndexQueryCache* cache,
const InvertedIndexQueryCache::CacheKey& cache_key,
InvertedIndexQueryCacheHandle* cache_handler,
Expand Down Expand Up @@ -277,11 +266,6 @@ class FullTextIndexReader : public InvertedIndexReader {
}

InvertedIndexReaderType type() override;

static void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);
static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer,
const std::map<string, string>& properties);
};

class StringTypeInvertedIndexReader : public InvertedIndexReader {
Expand Down
Loading

0 comments on commit a0b94ba

Please sign in to comment.