diff --git a/.clang-format-ignore b/.clang-format-ignore index 5f10e2a221203c..675b0803161bf0 100644 --- a/.clang-format-ignore +++ b/.clang-format-ignore @@ -9,4 +9,5 @@ be/src/util/sse2neon.h be/src/util/mustache/mustache.h be/src/util/mustache/mustache.cc be/src/util/utf8_check.cpp +be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h cloud/src/common/defer.h diff --git a/.gitignore b/.gitignore index 7a61c598c99f75..6ef1c5c52e5174 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,6 @@ compile_commands.json .github .worktrees/ + +# generated kuromoji dictionary binaries +/be/dict/kuromoji/*.bin diff --git a/.licenserc.yaml b/.licenserc.yaml index 90d2f778686701..78dacfd0f90438 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -80,6 +80,7 @@ header: - "be/src/util/sse2neo.h" - "be/src/util/sse2neon.h" - "be/src/util/utf8_check.cpp" + - "be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h" - "be/src/pch/*" - "be/test/data" - "be/test/expected_result" diff --git a/NOTICE.txt b/NOTICE.txt index 3eec702a28a663..f5a3d52d172e87 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -73,6 +73,9 @@ This software includes third party software subject to the following copyrights: - Netty Reactive Streams - https://github.com/playframework/netty-reactive-streams - Jackson-core - https://github.com/FasterXML/jackson-core - Jackson-dataformat-cbor - https://github.com/FasterXML/jackson-dataformats-binary +- Darts-clone (double-array trie) - Copyright 2008-2014 Susumu Yata - https://github.com/s-yata/darts-clone (BSD 2-clause; see dist/licenses/LICENSE-darts-clone.txt) +- mecab-ipadic (IPADIC) Japanese morphological dictionary - Copyright 2000-2003 Nara Institute of Science and Technology (NAIST) - licensed under NAIST-2003 (BSD-style); the kuromoji analyzer bundles the UTF-8 form from https://github.com/lindera/mecab-ipadic (content of mecab-ipadic-2.7.0-20070801). See dist/licenses/LICENSE-ipadic.txt. 
+- Apache Lucene - https://github.com/apache/lucene (Apache-2.0): the kuromoji Japanese analyzer under be/src/storage/index/inverted/analyzer/kuromoji is an independent C++ implementation modeled on Lucene's kuromoji analyzer (JapaneseTokenizer), including its search-mode compound-decomposition cost model. The licenses for these third party components are included in LICENSE.txt diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 0df7454eeacb8a..8833b8bae0e5ac 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -313,6 +313,12 @@ install(DIRECTORY ${BASE_DIR}/dict/pinyin DESTINATION ${OUTPUT_DIR}/dict) +# Japanese kuromoji dictionary (README.md plus the generated *.bin files); OPTIONAL makes a missing directory a non-error at install time +install(DIRECTORY + ${BASE_DIR}/dict/kuromoji + DESTINATION ${OUTPUT_DIR}/dict + OPTIONAL) + # Check if functions are supported in this platform. All flags will generated # in gensrc/build/common/env_config.h. # You can check funcion here which depends on platform. Don't forget add this diff --git a/be/dict/kuromoji/README.md b/be/dict/kuromoji/README.md new file mode 100644 index 00000000000000..2da450abc30c63 --- /dev/null +++ b/be/dict/kuromoji/README.md @@ -0,0 +1,36 @@ +# Kuromoji (Japanese) dictionary + +This directory holds the compiled IPADIC dictionary consumed at runtime by the +`kuromoji` inverted-index analyzer (`KuromojiAnalyzer` → `KuromojiDictionary`): + +- `system.bin` — surface→word Darts trie + word entries + feature blob +- `matrix.bin` — connection-cost matrix (1316×1316) +- `chardef.bin` — character-category map + per-category flags +- `unkdict.bin` — unknown-word entries per category + +These `*.bin` files are **generated** (not committed; see `.gitignore`). The +runtime resolves them at `${inverted_index_dict_path}/kuromoji` +(default `${DORIS_HOME}/dict/kuromoji`); `be/CMakeLists.txt` installs this +directory into the BE package.
+ +## How it's (re)generated + +Source: the UTF-8 IPADIC from <https://github.com/lindera/mecab-ipadic> +(tag `2.7.0-20250920`) — the original `mecab-ipadic-2.7.0-20070801` lexicon +converted to UTF-8 (license: NAIST-2003, see `dist/licenses/LICENSE-ipadic.txt`). + +Automated, two steps: + +```bash +# 1. thirdparty fetches + stages the UTF-8 IPADIC source into +# ${DORIS_THIRDPARTY}/installed/share/mecab-ipadic-2.7.0-20250920 +sh thirdparty/build-thirdparty.sh mecab_ipadic + +# 2. the CMake target builds the offline compiler and produces the *.bin here +ninja -C be/ut_build_RELEASE kuromoji_dict +``` + +CI/release should run `ninja kuromoji_dict` before packaging; the BE `install` +rule then ships this directory. Override the source dir with +`-DKUROMOJI_IPADIC_SRC=<dir>` at CMake configure time. (The tool can also be +run directly: `kuromoji_build_dict be/dict/kuromoji`.) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index b521fa57ec3dfd..1df33354a1750d 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1287,6 +1287,8 @@ DEFINE_mDouble(inverted_index_ram_buffer_size, "512"); DEFINE_mInt32(inverted_index_max_buffered_docs, "-1"); // dict path for chinese analyzer DEFINE_String(inverted_index_dict_path, "${DORIS_HOME}/dict"); +// Whether the kuromoji (Japanese) analyzer may be used; off by default +DEFINE_mBool(enable_kuromoji_analyzer, "false"); DEFINE_Int32(inverted_index_read_buffer_size, "4096"); // tree depth for bkd index DEFINE_Int32(max_depth_in_bkd_tree, "32"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 0b415ed5d2c4ae..fdf47c32687660 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1329,6 +1329,8 @@ DECLARE_mDouble(inverted_index_ram_buffer_size); DECLARE_mInt32(inverted_index_max_buffered_docs); // dict path for chinese analyzer DECLARE_String(inverted_index_dict_path); +// Whether the kuromoji (Japanese) analyzer may be used; off by default +DECLARE_mBool(enable_kuromoji_analyzer); DECLARE_Int32(inverted_index_read_buffer_size); // tree depth for bkd index
DECLARE_Int32(max_depth_in_bkd_tree); diff --git a/be/src/storage/index/inverted/analyzer/analyzer.cpp b/be/src/storage/index/inverted/analyzer/analyzer.cpp index 6d16613bcd9b00..1d9ba7e3169892 100644 --- a/be/src/storage/index/inverted/analyzer/analyzer.cpp +++ b/be/src/storage/index/inverted/analyzer/analyzer.cpp @@ -39,6 +39,7 @@ #include "storage/index/inverted/analyzer/basic/basic_analyzer.h" #include "storage/index/inverted/analyzer/icu/icu_analyzer.h" #include "storage/index/inverted/analyzer/ik/IKAnalyzer.h" +#include "storage/index/inverted/analyzer/kuromoji/KuromojiAnalyzer.h" #include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h" namespace doris::segment_v2::inverted_index { @@ -69,7 +70,8 @@ bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name analyzer_name == INVERTED_INDEX_PARSER_CHINESE || analyzer_name == INVERTED_INDEX_PARSER_ICU || analyzer_name == INVERTED_INDEX_PARSER_BASIC || - analyzer_name == INVERTED_INDEX_PARSER_IK; + analyzer_name == INVERTED_INDEX_PARSER_IK || + analyzer_name == INVERTED_INDEX_PARSER_KUROMOJI; } AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type, @@ -107,6 +109,17 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy ik_analyzer->setMode(false); } analyzer = std::move(ik_analyzer); + } else if (parser_type == InvertedIndexParserType::PARSER_KUROMOJI) { + if (!config::enable_kuromoji_analyzer) { + throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR, + "kuromoji analyzer is disabled by default. 
Set " + "enable_kuromoji_analyzer=true in " + "be.conf (or via the BE config HTTP API) to enable it."); + } + auto kuromoji_analyzer = std::make_shared(); + kuromoji_analyzer->initDict(config::inverted_index_dict_path + "/kuromoji"); + kuromoji_analyzer->setMode(kuromoji_mode_from_string(parser_mode)); + analyzer = std::move(kuromoji_analyzer); } else { // default analyzer = std::make_shared>(); diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiAnalyzer.h b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiAnalyzer.h new file mode 100644 index 00000000000000..7b1354a4078b55 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiAnalyzer.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "common/logging.h" +#include "storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" + +namespace doris::segment_v2 { + +// Analyzer front end for the kuromoji (Japanese) tokenizer: holds the dictionary +// pointer and the segmentation mode and builds KuromojiTokenizer objects from them. +class KuromojiAnalyzer : public Analyzer { +public: + // Defaults: _lowercase = true and _ownReader = false; both are forwarded to + // every tokenizer this analyzer creates. + KuromojiAnalyzer() { + _lowercase = true; + _ownReader = false; + } + ~KuromojiAnalyzer() override = default; + + bool isSDocOpt() override { return true; } + + // Loads (once, process-wide) the IPADIC dictionary from `dictPath`. If it is + // unavailable the tokenizer degrades to a per-codepoint split (logged), rather + // than failing index/query. + void initDict(const std::string& dictPath) override { + dict_ = kuromoji::KuromojiDictionary::get_or_load(dictPath); + if (dict_ == nullptr) { + LOG(WARNING) << "kuromoji: dictionary unavailable at " << dictPath + << "; falling back to per-codepoint tokenization"; + } + } + + // Sets the segmentation mode for tokenizers created afterwards. The tokenizer + // cached by reusableTokenStream() keeps the mode it was constructed with. + void setMode(KuromojiMode mode) { mode_ = mode; } + + // Allocates a new tokenizer with _CLNEW, resets it onto `reader` and returns it. + // `fieldName` is not used. NOTE(review): presumably the caller takes ownership + // of the returned stream - confirm against the CLucene Analyzer contract. + TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { + auto* tokenizer = _CLNEW KuromojiTokenizer(mode_, _lowercase, _ownReader, dict_); + tokenizer->reset(reader); + return (TokenStream*)tokenizer; + } + + // Returns the analyzer-owned tokenizer (created on first use) after resetting it + // onto `reader`; the same object is handed out on every call. `fieldName` is + // not used. + TokenStream* reusableTokenStream(const TCHAR* fieldName, + lucene::util::Reader* reader) override { + if (tokenizer_ == nullptr) { + tokenizer_ = std::make_unique(mode_, _lowercase, _ownReader, dict_); + } + tokenizer_->reset(reader); + return (TokenStream*)tokenizer_.get(); + } + +private: + // Set by initDict(); nullptr selects the tokenizer's per-codepoint fallback. + const kuromoji::KuromojiDictionary* dict_ {nullptr}; + KuromojiMode mode_ {KuromojiMode::Search}; + // Tokenizer reused across reusableTokenStream() calls. + std::unique_ptr tokenizer_; +}; + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiMode.h b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiMode.h new file mode 100644 index 00000000000000..3625f825ec02ac --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiMode.h @@ -0,0 +1,41
@@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace doris::segment_v2 { + +// Segmentation mode, mirroring Lucene's JapaneseTokenizer.Mode. Normal returns +// the minimum-cost segmentation. Search additionally decomposes long compounds +// into their shorter parts (via a length-based cost penalty) for better search +// recall. Extended applies the Search penalty and also splits unknown +// (out-of-vocabulary) words into per-character unigrams. 
+enum class KuromojiMode { Normal, Search, Extended }; + +// Maps a mode string to a KuromojiMode. Only the exact, case-sensitive strings +// "normal" and "extended" select those modes; every other value, including the +// empty string, yields Search. +inline KuromojiMode kuromoji_mode_from_string(const std::string& mode) { + if (mode == "normal") { + return KuromojiMode::Normal; + } + if (mode == "extended") { + return KuromojiMode::Extended; + } + return KuromojiMode::Search; // default (matches OpenSearch/Lucene) +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.cpp b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.cpp new file mode 100644 index 00000000000000..bd1ae7f7692077 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.cpp @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h" + +#include +#include + +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_normalize.h" + +namespace doris::segment_v2 { + +namespace { +// Number of bytes in the UTF-8 sequence whose lead byte is `c`.
+inline int utf8_len(unsigned char c) { + if (c < 0x80) { // 0xxxxxxx: ASCII + return 1; + } + if ((c >> 5) == 0x6) { // 110xxxxx + return 2; + } + if ((c >> 4) == 0xE) { // 1110xxxx + return 3; + } + if ((c >> 3) == 0x1E) { // 11110xxx + return 4; + } + return 1; // invalid lead byte (e.g. a stray continuation byte 10xxxxxx): treat as a single byte +} + +// True for the six ASCII whitespace bytes: space, \t, \n, \r, \f and \v. +inline bool is_ascii_space(unsigned char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; +} + +// Returns the idx-th comma-separated field of an IPADIC feature string +// (0=POS1 ... 6=base form, 7=reading, 8=pronunciation), or empty when idx is negative or past the last field. +std::string_view feature_field(std::string_view feat, int idx) { + int cur = 0; + std::size_t start = 0; + for (std::size_t i = 0; i <= feat.size(); ++i) { + if (i == feat.size() || feat[i] == ',') { // end of input closes the last field like a trailing comma + if (cur == idx) { + return feat.substr(start, i - start); + } + ++cur; + start = i + 1; + } + } + return {}; +} + +// Part-of-speech (POS1) classes dropped for full-text search. A coarse subset of +// Lucene/OpenSearch's ja stoptags: particles, auxiliary verbs, conjunctions, +// symbols, fillers. (Full stoptags fidelity is a later refinement.)
+bool is_stop_pos(std::string_view pos1) { + return pos1 == "\xE5\x8A\xA9\xE8\xA9\x9E" || // 助詞 (particle) + pos1 == "\xE5\x8A\xA9\xE5\x8B\x95\xE8\xA9\x9E" || // 助動詞 (auxiliary verb) + pos1 == "\xE6\x8E\xA5\xE7\xB6\x9A\xE8\xA9\x9E" || // 接続詞 (conjunction) + pos1 == "\xE8\xA8\x98\xE5\x8F\xB7" || // 記号 (symbol) + pos1 == "\xE3\x83\x95\xE3\x82\xA3\xE3\x83\xA9\xE3\x83\xBC"; // フィラー (filler) +} + +// Lowercases ASCII 'A'-'Z' in place; every other byte is left untouched. +void ascii_lower(std::string& s) { + for (char& c : s) { + if (c >= 'A' && c <= 'Z') { + c = static_cast(c - 'A' + 'a'); + } + } +} +} // namespace + +// Stores the mode and the dictionary pointer (nullptr selects the fallback split) +// and copies the two flags into the inherited lowercase / ownReader members. +KuromojiTokenizer::KuromojiTokenizer(KuromojiMode mode, bool lower_case, bool own_reader, + const kuromoji::KuromojiDictionary* dict) + : mode_(mode), dict_(dict) { + this->lowercase = lower_case; + this->ownReader = own_reader; +} + +// Reads all of `reader`, segments the text and buffers the resulting terms in +// tokens_text_ for next() to hand out. +void KuromojiTokenizer::reset(lucene::util::Reader* reader) { + this->input = reader; + buffer_index_ = 0; + data_length_ = 0; + tokens_text_.clear(); + + // Read the entire input. readCopy returns the count read, or <= 0 at EOF. + std::string text; + char buf[4096]; + int32_t n = 0; + while ((n = reader->readCopy(buf, 0, static_cast(sizeof(buf)))) > 0) { + text.append(buf, n); + } + + if (dict_ != nullptr) { + // Viterbi morphological segmentation, then OpenSearch-default-style filtering: + // drop stop part-of-speech (particles/auxiliaries/...), emit the dictionary + // base form for conjugated words, and lowercase embedded ASCII. + kuromoji::KuromojiViterbi viterbi(*dict_, mode_); + std::vector morphemes; + viterbi.segment(text, &morphemes); + tokens_text_.reserve(morphemes.size()); + for (const auto& m : morphemes) { + const std::string_view feat = + m.known ? dict_->feature(dict_->word(m.word_id)) + : dict_->unknown_feature(dict_->unknown_word(m.word_id)); + if (is_stop_pos(feature_field(feat, 0))) { + continue; // part-of-speech stop filtering + } + const std::string_view base = feature_field(feat, 6); + std::string term = (base.empty() || base == "*") ?
text.substr(m.byte_start, m.byte_len) + : std::string(base); + term = kuromoji::cjk_width_normalize( + term); // full-width ASCII -> ASCII before lowercase + if (this->lowercase) { + ascii_lower(term); + } + if (!term.empty()) { + tokens_text_.push_back(std::move(term)); + } + } + } else { + // Fallback (dictionary unavailable): one term per UTF-8 code point, + // skipping ASCII whitespace. + for (size_t i = 0; i < text.size();) { + int len = utf8_len(static_cast(text[i])); + if (i + static_cast(len) > text.size()) { + len = 1; // truncated tail: emit a single byte + } + if (!(len == 1 && is_ascii_space(static_cast(text[i])))) { + tokens_text_.emplace_back(text.substr(i, len)); + } + i += len; + } + } + data_length_ = static_cast(tokens_text_.size()); +} + +// Hands out the terms buffered by reset(), one per call; returns nullptr once all +// of them have been consumed. The token points into tokens_text_ (no copy). +Token* KuromojiTokenizer::next(Token* token) { + if (buffer_index_ >= data_length_) { + return nullptr; + } + std::string& token_text = tokens_text_[buffer_index_++]; + // reset() already segmented and normalized the terms; hand them out one at a + // time, capped at the CLucene maximum term length. + size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN)); + if (size < token_text.size()) { + // The cap counts bytes, so it can land inside a multi-byte UTF-8 sequence. + // Back off over continuation bytes (10xxxxxx) to the start of that sequence + // so the emitted term stays valid UTF-8. + size_t cut = size; + while (cut > 0 && (static_cast<unsigned char>(token_text[cut]) & 0xC0) == 0x80) { + --cut; + } + if (cut > 0) { + size = cut; // cut == 0 only for malformed input: keep the plain byte cap + } + } + token->setNoCopy(token_text.data(), 0, static_cast<int32_t>(size)); + return token; +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h new file mode 100644 index 00000000000000..5e37c016d35e00 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "CLucene.h" +#include "CLucene/analysis/AnalysisHeader.h" +#include "storage/index/inverted/analyzer/kuromoji/KuromojiMode.h" +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2 { + +// Japanese tokenizer. With a dictionary it does Viterbi morphological +// segmentation (kuromoji::KuromojiViterbi); without one it falls back to +// emitting one term per UTF-8 code point (ASCII whitespace skipped). +class KuromojiTokenizer : public Tokenizer { +public: + // `dict` may be nullptr, which selects the per-code-point fallback. + explicit KuromojiTokenizer(KuromojiMode mode = KuromojiMode::Search, bool lowercase = true, + bool own_reader = false, + const kuromoji::KuromojiDictionary* dict = nullptr); + ~KuromojiTokenizer() override = default; + + // Returns the next buffered term in `token`, or nullptr when none are left. + Token* next(Token* token) override; + // Reads all of `reader` and buffers its terms for subsequent next() calls. + void reset(lucene::util::Reader* reader) override; + +private: + KuromojiMode mode_; + const kuromoji::KuromojiDictionary* dict_ {nullptr}; + // Index of the next term to hand out, and the number of buffered terms. + int32_t buffer_index_ {0}; + int32_t data_length_ {0}; + // Backing storage for emitted terms; must outlive tokens (setNoCopy contract). + std::vector tokens_text_; +}; + +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h b/be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h new file mode 100644 index 00000000000000..46f5652464fd78 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/darts.h @@ -0,0 +1,2001 @@ +// Vendored from https://github.com/s-yata/darts-clone (include/darts.h, v0.32).
+// Copyright (c) 2008-2014, Susumu Yata. Licensed under the 2-clause BSD license. +// See dist/licenses/LICENSE-darts-clone.txt. Unmodified except this header note +// and the `#pragma GCC system_header` below (so this third-party header is +// exempt from Doris's -Wall -Werror -Wpedantic when included). +#ifndef DARTS_H_ +#define DARTS_H_ + +// Treat the rest of this third-party header as a system header: suppress all +// warnings it would otherwise raise under Doris's strict compile flags. +#pragma GCC system_header + +#include +#include +#include + +#define DARTS_VERSION "0.32" + +// DARTS_THROW() throws a whose message starts with the +// file name and the line number. For example, DARTS_THROW("error message") at +// line 123 of "darts.h" throws a which has a pointer to +// "darts.h:123: exception: error message". The message is available by using +// what() as well as that of . +#define DARTS_INT_TO_STR(value) #value +#define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line) +#define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__) +#define DARTS_THROW(msg) throw Darts::Details::Exception( \ + __FILE__ ":" DARTS_LINE_STR ": exception: " msg) + +namespace Darts { + +// The following namespace hides the internal types and classes. +namespace Details { + +// This header assumes that and are 32-bit integer types. +// +// Darts-clone keeps values associated with keys. The type of the values is +// . Note that the values must be positive integers because the +// most significant bit (MSB) of each value is used to represent whether the +// corresponding unit is a leaf or not. Also, the keys are represented by +// sequences of s. is the unsigned type of . +typedef char char_type; +typedef unsigned char uchar_type; +typedef int value_type; + +// The main structure of Darts-clone is an array of s, and the +// unit type is actually a wrapper of . +typedef unsigned int id_type; + +// is the type of callback functions for reporting the +// progress of building a dictionary. 
See also build() of . +// The 1st argument receives the progress value and the 2nd argument receives +// the maximum progress value. A usage example is to show the progress +// percentage, 100.0 * (the 1st argument) / (the 2nd argument). +typedef int (*progress_func_type)(std::size_t, std::size_t); + +// is the type of double-array units and it is a wrapper of +// in practice. +class DoubleArrayUnit { + public: + DoubleArrayUnit() : unit_() {} + + // has_leaf() returns whether a leaf unit is immediately derived from the + // unit (true) or not (false). + bool has_leaf() const { + return ((unit_ >> 8) & 1) == 1; + } + // value() returns the value stored in the unit, and thus value() is + // available when and only when the unit is a leaf unit. + value_type value() const { + return static_cast(unit_ & ((1U << 31) - 1)); + } + + // label() returns the label associted with the unit. Note that a leaf unit + // always returns an invalid label. For this feature, leaf unit's label() + // returns an that has the MSB of 1. + id_type label() const { + return unit_ & ((1U << 31) | 0xFF); + } + // offset() returns the offset from the unit to its derived units. + id_type offset() const { + return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); + } + + private: + id_type unit_; + + // Copyable. +}; + +// Darts-clone throws an for memory allocation failure, invalid +// arguments or a too large offset. The last case means that there are too many +// keys in the given set of keys. Note that the `msg' of must be a +// constant or static string because an keeps only a pointer to +// that string. +class Exception : public std::exception { + public: + explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} + Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} + virtual ~Exception() throw() {} + + // overrides what() of . + virtual const char *what() const throw() { + return (msg_ != NULL) ? msg_ : ""; + } + + private: + const char *msg_; + + // Disallows operator=. 
+ Exception &operator=(const Exception &); +}; + +} // namespace Details + +// is the interface of Darts-clone. Note that other +// classes should not be accessed from outside. +// +// has 4 template arguments but only the 3rd one is used as +// the type of values. Note that the given is used only from outside, and +// the internal value type is not changed from . +// In build(), given values are casted from to +// by using static_cast. On the other hand, values are casted from +// to in searching dictionaries. +template +class DoubleArrayImpl { + public: + // Even if this is changed, the internal value type is still + // . Other types, such as 64-bit integer types + // and floating-point number types, should not be used. + typedef T value_type; + // A key is reprenseted by a sequence of s. For example, + // exactMatchSearch() takes a . + typedef Details::char_type key_type; + // In searching dictionaries, the values associated with the matched keys are + // stored into or returned as s. + typedef value_type result_type; + + // enables applications to get the lengths of the matched + // keys in addition to the values. + struct result_pair_type { + value_type value; + std::size_t length; + }; + + // The constructor initializes member variables with 0 and NULLs. + DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} + // The destructor frees memory allocated for units and then initializes + // member variables with 0 and NULLs. + virtual ~DoubleArrayImpl() { + clear(); + } + + // has 2 kinds of set_result()s. The 1st set_result() is to + // set a value to a . The 2nd set_result() is to set a value and + // a length to a . By using set_result()s, search methods + // can return the 2 kinds of results in the same way. + // Why the set_result()s are non-static? It is for compatibility. + // + // The 1st set_result() takes a length as the 3rd argument but it is not + // used. If a compiler does a good job, codes for getting the length may be + // removed. 
+ void set_result(value_type *result, value_type value, std::size_t) const { + *result = value; + } + // The 2nd set_result() uses both `value' and `length'. + void set_result(result_pair_type *result, + value_type value, std::size_t length) const { + result->value = value; + result->length = length; + } + + // set_array() calls clear() in order to free memory allocated to the old + // array and then sets a new array. This function is useful to set a memory- + // mapped array. Note that the array set by set_array() is not freed in + // clear() and the destructor of . + // set_array() can also set the size of the new array but the size is not + // used in search methods. So it works well even if the 2nd argument is 0 or + // omitted. Remember that size() and total_size() returns 0 in such a case. + void set_array(const void *ptr, std::size_t size = 0) { + clear(); + array_ = static_cast(ptr); + size_ = size; + } + // array() returns a pointer to the array of units. + const void *array() const { + return array_; + } + + // clear() frees memory allocated to units and then initializes member + // variables with 0 and NULLs. Note that clear() does not free memory if the + // array of units was set by set_array(). In such a case, `array_' is not + // NULL and `buf_' is NULL. + void clear() { + size_ = 0; + array_ = NULL; + if (buf_ != NULL) { + delete[] buf_; + buf_ = NULL; + } + } + + // unit_size() returns the size of each unit. The size must be 4 bytes. + std::size_t unit_size() const { + return sizeof(unit_type); + } + // size() returns the number of units. It can be 0 if set_array() is used. + std::size_t size() const { + return size_; + } + // total_size() returns the number of bytes allocated to the array of units. + // It can be 0 if set_array() is used. + std::size_t total_size() const { + return unit_size() * size(); + } + // nonzero_size() exists for compatibility. 
It always returns the number of + // units because it takes long time to count the number of non-zero units. + std::size_t nonzero_size() const { + return size(); + } + + // build() constructs a dictionary from given key-value pairs. If `lengths' + // is NULL, `keys' is handled as an array of zero-terminated strings. If + // `values' is NULL, the index in `keys' is associated with each key, i.e. + // the ith key has (i - 1) as its value. + // Note that the key-value pairs must be arranged in key order and the values + // must not be negative. Also, if there are duplicate keys, only the first + // pair will be stored in the resultant dictionary. + // `progress_func' is a pointer to a callback function. If it is not NULL, + // it will be called in build() so that the caller can check the progress of + // dictionary construction. For details, please see the definition of + // . + // The return value of build() is 0, and it indicates the success of the + // operation. Otherwise, build() throws a , which is a + // derived class of . + // build() uses another construction algorithm if `values' is not NULL. In + // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead + // of a trie because a DAWG is likely to be more compact than a trie. + int build(std::size_t num_keys, const key_type * const *keys, + const std::size_t *lengths = NULL, const value_type *values = NULL, + Details::progress_func_type progress_func = NULL); + + // open() reads an array of units from the specified file. And if it goes + // well, the old array will be freed and replaced with the new array read + // from the file. `offset' specifies the number of bytes to be skipped before + // reading an array. `size' specifies the number of bytes to be read from the + // file. If the `size' is 0, the whole file will be read. + // open() returns 0 iff the operation succeeds. Otherwise, it returns a + // non-zero value or throws a . 
The exception is thrown + // when and only when a memory allocation fails. + int open(const char *file_name, const char *mode = "rb", + std::size_t offset = 0, std::size_t size = 0); + // save() writes the array of units into the specified file. `offset' + // specifies the number of bytes to be skipped before writing the array. + // open() returns 0 iff the operation succeeds. Otherwise, it returns a + // non-zero value. + int save(const char *file_name, const char *mode = "wb", + std::size_t offset = 0) const; + + // The 1st exactMatchSearch() tests whether the given key exists or not, and + // if it exists, its value and length are set to `result'. Otherwise, the + // value and the length of `result' are set to -1 and 0 respectively. + // Note that if `length' is 0, `key' is handled as a zero-terminated string. + // `node_pos' specifies the start position of matching. This argument enables + // the combination of exactMatchSearch() and traverse(). For example, if you + // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get + // the node position corresponding to "xyz" and then you can use + // exactMatchSearch() to test "A", "BC", and "DE" from that position. + // Note that the length of `result' indicates the length from the `node_pos'. + // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. + template + void exactMatchSearch(const key_type *key, U &result, + std::size_t length = 0, std::size_t node_pos = 0) const { + result = exactMatchSearch(key, length, node_pos); + } + // The 2nd exactMatchSearch() returns a result instead of updating the 2nd + // argument. So, the following exactMatchSearch() has only 3 arguments. + template + inline U exactMatchSearch(const key_type *key, std::size_t length = 0, + std::size_t node_pos = 0) const; + + // commonPrefixSearch() searches for keys which match a prefix of the given + // string. If `length' is 0, `key' is handled as a zero-terminated string. 
+ // The values and the lengths of at most `max_num_results' matched keys are + // stored in `results'. commonPrefixSearch() returns the number of matched + // keys. Note that the return value can be larger than `max_num_results' if + // there are more than `max_num_results' matches. If you want to get all the + // results, allocate more space and call commonPrefixSearch() again. + // `node_pos' works as well as in exactMatchSearch(). + template + inline std::size_t commonPrefixSearch(const key_type *key, U *results, + std::size_t max_num_results, std::size_t length = 0, + std::size_t node_pos = 0) const; + + // The 1st commonLongestPrefixSearch() searches for the longest key which + // matches a prefix of the given string, and if it exists, its value and + // length are set to `result'. Otherwise, the value and the length of + // `result' are set to -1 and 0 respectively. Note that if `length' is 0, + // `key' is handled as a zero-terminated string. `node_pos' works as well as + // in exactMatchSearch(). + template + void commonLongestPrefixSearch(const key_type *key, U &result, + std::size_t length = 0, std::size_t node_pos = 0) const { + result = commonLongestPrefixSearch(key, length, node_pos); + } + // The 2nd commonLongestPrefixSearch() returns a result instead of updating + // the 2nd argument. So, the following commonLongestPrefixSearch() has only + // 3 arguments. + template + inline U commonLongestPrefixSearch(const key_type *key, + std::size_t length = 0, std::size_t node_pos = 0) const; + + // In Darts-clone, a dictionary is a deterministic finite-state automaton + // (DFA) and traverse() tests transitions on the DFA. The initial state is + // `node_pos' and traverse() chooses transitions labeled key[key_pos], + // key[key_pos + 1], ... in order. If there is not a transition labeled + // key[key_pos + i], traverse() terminates the transitions at that state and + // returns -2. 
Otherwise, traverse() ends without a termination and returns + // -1 or a nonnegative value, -1 indicates that the final state was not an + // accept state. When a nonnegative value is returned, it is the value + // associated with the final accept state. That is, traverse() returns the + // value associated with the given key if it exists. Note that traverse() + // updates `node_pos' and `key_pos' after each transition. + inline value_type traverse(const key_type *key, std::size_t &node_pos, + std::size_t &key_pos, std::size_t length = 0) const; + + private: + typedef Details::uchar_type uchar_type; + typedef Details::id_type id_type; + typedef Details::DoubleArrayUnit unit_type; + + std::size_t size_; + const unit_type *array_; + unit_type *buf_; + + // Disallows copy and assignment. + DoubleArrayImpl(const DoubleArrayImpl &); + DoubleArrayImpl &operator=(const DoubleArrayImpl &); +}; + +// <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int> +// as the type of values and it is suitable for most cases. +typedef DoubleArrayImpl DoubleArray; + +// The interface section ends here. For using Darts-clone, there is no need +// to read the remaining section, which gives the implementation of +// Darts-clone. + +// +// Member functions of DoubleArrayImpl (except build()). 
+// + +template +int DoubleArrayImpl::open(const char *file_name, + const char *mode, std::size_t offset, std::size_t size) { +#ifdef _MSC_VER + std::FILE *file; + if (::fopen_s(&file, file_name, mode) != 0) { + return -1; + } +#else + std::FILE *file = std::fopen(file_name, mode); + if (file == NULL) { + return -1; + } +#endif + + if (size == 0) { + if (std::fseek(file, 0, SEEK_END) != 0) { + std::fclose(file); + return -1; + } + size = std::ftell(file) - offset; + } + + size /= unit_size(); + if (size < 256 || (size & 0xFF) != 0) { + std::fclose(file); + return -1; + } + + if (std::fseek(file, offset, SEEK_SET) != 0) { + std::fclose(file); + return -1; + } + + unit_type units[256]; + if (std::fread(units, unit_size(), 256, file) != 256) { + std::fclose(file); + return -1; + } + + if (units[0].label() != '\0' || units[0].has_leaf() || + units[0].offset() == 0 || units[0].offset() >= 512) { + std::fclose(file); + return -1; + } + for (id_type i = 1; i < 256; ++i) { + if (units[i].label() <= 0xFF && units[i].offset() >= size) { + std::fclose(file); + return -1; + } + } + + unit_type *buf; + try { + buf = new unit_type[size]; + for (id_type i = 0; i < 256; ++i) { + buf[i] = units[i]; + } + } catch (const std::bad_alloc &) { + std::fclose(file); + DARTS_THROW("failed to open double-array: std::bad_alloc"); + } + + if (size > 256) { + if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { + std::fclose(file); + delete[] buf; + return -1; + } + } + std::fclose(file); + + clear(); + + size_ = size; + array_ = buf; + buf_ = buf; + return 0; +} + +template +int DoubleArrayImpl::save(const char *file_name, + const char *mode, std::size_t offset) const { + if (size() == 0) { + return -1; + } + +#ifdef _MSC_VER + std::FILE *file; + if (::fopen_s(&file, file_name, mode) != 0) { + return -1; + } +#else + std::FILE *file = std::fopen(file_name, mode); + if (file == NULL) { + return -1; + } +#endif + + if (std::fseek(file, offset, SEEK_SET) != 0) { + 
std::fclose(file); + return -1; + } + + if (std::fwrite(array_, unit_size(), size(), file) != size()) { + std::fclose(file); + return -1; + } + std::fclose(file); + return 0; +} + +template +template +inline U DoubleArrayImpl::exactMatchSearch(const key_type *key, + std::size_t length, std::size_t node_pos) const { + U result; + set_result(&result, static_cast(-1), 0); + + unit_type unit = array_[node_pos]; + if (length != 0) { + for (std::size_t i = 0; i < length; ++i) { + node_pos ^= unit.offset() ^ static_cast(key[i]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[i])) { + return result; + } + } + } else { + for ( ; key[length] != '\0'; ++length) { + node_pos ^= unit.offset() ^ static_cast(key[length]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[length])) { + return result; + } + } + } + + if (!unit.has_leaf()) { + return result; + } + unit = array_[node_pos ^ unit.offset()]; + set_result(&result, static_cast(unit.value()), length); + return result; +} + +template +template +inline std::size_t DoubleArrayImpl::commonPrefixSearch( + const key_type *key, U *results, std::size_t max_num_results, + std::size_t length, std::size_t node_pos) const { + std::size_t num_results = 0; + + unit_type unit = array_[node_pos]; + node_pos ^= unit.offset(); + if (length != 0) { + for (std::size_t i = 0; i < length; ++i) { + node_pos ^= static_cast(key[i]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[i])) { + return num_results; + } + + node_pos ^= unit.offset(); + if (unit.has_leaf()) { + if (num_results < max_num_results) { + set_result(&results[num_results], static_cast( + array_[node_pos].value()), i + 1); + } + ++num_results; + } + } + } else { + for ( ; key[length] != '\0'; ++length) { + node_pos ^= static_cast(key[length]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[length])) { + return num_results; + } + + node_pos ^= unit.offset(); + if (unit.has_leaf()) { + if (num_results < 
max_num_results) { + set_result(&results[num_results], static_cast( + array_[node_pos].value()), length + 1); + } + ++num_results; + } + } + } + + return num_results; +} + +template +template +inline U DoubleArrayImpl::commonLongestPrefixSearch( + const key_type *key, std::size_t length, + std::size_t node_pos) const { + U result; + set_result(&result, static_cast(-1), 0); + + unit_type unit = array_[node_pos]; + node_pos ^= unit.offset(); + if (length != 0) { + for (std::size_t i = 0; i < length; ++i) { + node_pos ^= static_cast(key[i]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[i])) { + return result; + } + + node_pos ^= unit.offset(); + if (unit.has_leaf()) { + set_result(&result, static_cast( + array_[node_pos].value()), i + 1); + } + } + } else { + for ( ; key[length] != '\0'; ++length) { + node_pos ^= static_cast(key[length]); + unit = array_[node_pos]; + if (unit.label() != static_cast(key[length])) { + return result; + } + + node_pos ^= unit.offset(); + if (unit.has_leaf()) { + set_result(&result, static_cast( + array_[node_pos].value()), length + 1); + } + } + } + + return result; +} + +template +inline typename DoubleArrayImpl::value_type +DoubleArrayImpl::traverse(const key_type *key, + std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { + id_type id = static_cast(node_pos); + unit_type unit = array_[id]; + + if (length != 0) { + for ( ; key_pos < length; ++key_pos) { + id ^= unit.offset() ^ static_cast(key[key_pos]); + unit = array_[id]; + if (unit.label() != static_cast(key[key_pos])) { + return static_cast(-2); + } + node_pos = id; + } + } else { + for ( ; key[key_pos] != '\0'; ++key_pos) { + id ^= unit.offset() ^ static_cast(key[key_pos]); + unit = array_[id]; + if (unit.label() != static_cast(key[key_pos])) { + return static_cast(-2); + } + node_pos = id; + } + } + + if (!unit.has_leaf()) { + return static_cast(-1); + } + unit = array_[id ^ unit.offset()]; + return static_cast(unit.value()); +} + 
+namespace Details { + +// +// Memory management of array. +// + +template +class AutoArray { + public: + explicit AutoArray(T *array = NULL) : array_(array) {} + ~AutoArray() { + clear(); + } + + const T &operator[](std::size_t id) const { + return array_[id]; + } + T &operator[](std::size_t id) { + return array_[id]; + } + + bool empty() const { + return array_ == NULL; + } + + void clear() { + if (array_ != NULL) { + delete[] array_; + array_ = NULL; + } + } + void swap(AutoArray *array) { + T *temp = array_; + array_ = array->array_; + array->array_ = temp; + } + void reset(T *array = NULL) { + AutoArray(array).swap(this); + } + + private: + T *array_; + + // Disallows copy and assignment. + AutoArray(const AutoArray &); + AutoArray &operator=(const AutoArray &); +}; + +// +// Memory management of resizable array. +// + +template +class AutoPool { + public: + AutoPool() : buf_(), size_(0), capacity_(0) {} + ~AutoPool() { clear(); } + + const T &operator[](std::size_t id) const { + return *(reinterpret_cast(&buf_[0]) + id); + } + T &operator[](std::size_t id) { + return *(reinterpret_cast(&buf_[0]) + id); + } + + bool empty() const { + return size_ == 0; + } + std::size_t size() const { + return size_; + } + + void clear() { + resize(0); + buf_.clear(); + size_ = 0; + capacity_ = 0; + } + + void push_back(const T &value) { + append(value); + } + void pop_back() { + (*this)[--size_].~T(); + } + + void append() { + if (size_ == capacity_) + resize_buf(size_ + 1); + new(&(*this)[size_++]) T; + } + void append(const T &value) { + if (size_ == capacity_) + resize_buf(size_ + 1); + new(&(*this)[size_++]) T(value); + } + + void resize(std::size_t size) { + while (size_ > size) { + (*this)[--size_].~T(); + } + if (size > capacity_) { + resize_buf(size); + } + while (size_ < size) { + new(&(*this)[size_++]) T; + } + } + void resize(std::size_t size, const T &value) { + while (size_ > size) { + (*this)[--size_].~T(); + } + if (size > capacity_) { + resize_buf(size); + } + 
while (size_ < size) { + new(&(*this)[size_++]) T(value); + } + } + + void reserve(std::size_t size) { + if (size > capacity_) { + resize_buf(size); + } + } + + private: + AutoArray buf_; + std::size_t size_; + std::size_t capacity_; + + // Disallows copy and assignment. + AutoPool(const AutoPool &); + AutoPool &operator=(const AutoPool &); + + void resize_buf(std::size_t size); +}; + +template +void AutoPool::resize_buf(std::size_t size) { + std::size_t capacity; + if (size >= capacity_ * 2) { + capacity = size; + } else { + capacity = 1; + while (capacity < size) { + capacity <<= 1; + } + } + + AutoArray buf; + try { + buf.reset(new char[sizeof(T) * capacity]); + } catch (const std::bad_alloc &) { + DARTS_THROW("failed to resize pool: std::bad_alloc"); + } + + if (size_ > 0) { + T *src = reinterpret_cast(&buf_[0]); + T *dest = reinterpret_cast(&buf[0]); + for (std::size_t i = 0; i < size_; ++i) { + new(&dest[i]) T(src[i]); + src[i].~T(); + } + } + + buf_.swap(&buf); + capacity_ = capacity; +} + +// +// Memory management of stack. +// + +template +class AutoStack { + public: + AutoStack() : pool_() {} + ~AutoStack() { + clear(); + } + + const T &top() const { + return pool_[size() - 1]; + } + T &top() { + return pool_[size() - 1]; + } + + bool empty() const { + return pool_.empty(); + } + std::size_t size() const { + return pool_.size(); + } + + void push(const T &value) { + pool_.push_back(value); + } + void pop() { + pool_.pop_back(); + } + + void clear() { + pool_.clear(); + } + + private: + AutoPool pool_; + + // Disallows copy and assignment. + AutoStack(const AutoStack &); + AutoStack &operator=(const AutoStack &); +}; + +// +// Succinct bit vector. 
+// + +class BitVector { + public: + BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} + ~BitVector() { + clear(); + } + + bool operator[](std::size_t id) const { + return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; + } + + id_type rank(std::size_t id) const { + std::size_t unit_id = id / UNIT_SIZE; + return ranks_[unit_id] + pop_count(units_[unit_id] + & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); + } + + void set(std::size_t id, bool bit) { + if (bit) { + units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); + } else { + units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); + } + } + + bool empty() const { + return units_.empty(); + } + std::size_t num_ones() const { + return num_ones_; + } + std::size_t size() const { + return size_; + } + + void append() { + if ((size_ % UNIT_SIZE) == 0) { + units_.append(0); + } + ++size_; + } + void build(); + + void clear() { + units_.clear(); + ranks_.clear(); + } + + private: + enum { UNIT_SIZE = sizeof(id_type) * 8 }; + + AutoPool units_; + AutoArray ranks_; + std::size_t num_ones_; + std::size_t size_; + + // Disallows copy and assignment. + BitVector(const BitVector &); + BitVector &operator=(const BitVector &); + + static id_type pop_count(id_type unit) { + unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); + unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); + unit = ((unit >> 4) + unit) & 0x0F0F0F0F; + unit += unit >> 8; + unit += unit >> 16; + return unit & 0xFF; + } +}; + +inline void BitVector::build() { + try { + ranks_.reset(new id_type[units_.size()]); + } catch (const std::bad_alloc &) { + DARTS_THROW("failed to build rank index: std::bad_alloc"); + } + + num_ones_ = 0; + for (std::size_t i = 0; i < units_.size(); ++i) { + ranks_[i] = num_ones_; + num_ones_ += pop_count(units_[i]); + } +} + +// +// Keyset. 
+// + +template +class Keyset { + public: + Keyset(std::size_t num_keys, const char_type * const *keys, + const std::size_t *lengths, const T *values) : + num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} + + std::size_t num_keys() const { + return num_keys_; + } + const char_type *keys(std::size_t id) const { + return keys_[id]; + } + uchar_type keys(std::size_t key_id, std::size_t char_id) const { + if (has_lengths() && char_id >= lengths_[key_id]) + return '\0'; + return keys_[key_id][char_id]; + } + + bool has_lengths() const { + return lengths_ != NULL; + } + std::size_t lengths(std::size_t id) const { + if (has_lengths()) { + return lengths_[id]; + } + std::size_t length = 0; + while (keys_[id][length] != '\0') { + ++length; + } + return length; + } + + bool has_values() const { + return values_ != NULL; + } + const value_type values(std::size_t id) const { + if (has_values()) { + return static_cast(values_[id]); + } + return static_cast(id); + } + + private: + std::size_t num_keys_; + const char_type * const * keys_; + const std::size_t *lengths_; + const T *values_; + + // Disallows copy and assignment. + Keyset(const Keyset &); + Keyset &operator=(const Keyset &); +}; + +// +// Node of Directed Acyclic Word Graph (DAWG). 
+// + +class DawgNode { + public: + DawgNode() : child_(0), sibling_(0), label_('\0'), + is_state_(false), has_sibling_(false) {} + + void set_child(id_type child) { + child_ = child; + } + void set_sibling(id_type sibling) { + sibling_ = sibling; + } + void set_value(value_type value) { + child_ = value; + } + void set_label(uchar_type label) { + label_ = label; + } + void set_is_state(bool is_state) { + is_state_ = is_state; + } + void set_has_sibling(bool has_sibling) { + has_sibling_ = has_sibling; + } + + id_type child() const { + return child_; + } + id_type sibling() const { + return sibling_; + } + value_type value() const { + return static_cast(child_); + } + uchar_type label() const { + return label_; + } + bool is_state() const { + return is_state_; + } + bool has_sibling() const { + return has_sibling_; + } + + id_type unit() const { + if (label_ == '\0') { + return (child_ << 1) | (has_sibling_ ? 1 : 0); + } + return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); + } + + private: + id_type child_; + id_type sibling_; + uchar_type label_; + bool is_state_; + bool has_sibling_; + + // Copyable. +}; + +// +// Fixed unit of Directed Acyclic Word Graph (DAWG). +// + +class DawgUnit { + public: + explicit DawgUnit(id_type unit = 0) : unit_(unit) {} + DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} + + DawgUnit &operator=(id_type unit) { + unit_ = unit; + return *this; + } + + id_type unit() const { + return unit_; + } + + id_type child() const { + return unit_ >> 2; + } + bool has_sibling() const { + return (unit_ & 1) == 1; + } + value_type value() const { + return static_cast(unit_ >> 1); + } + bool is_state() const { + return (unit_ & 2) == 2; + } + + private: + id_type unit_; + + // Copyable. +}; + +// +// Directed Acyclic Word Graph (DAWG) builder. 
+// + +class DawgBuilder { + public: + DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), + table_(), node_stack_(), recycle_bin_(), num_states_(0) {} + ~DawgBuilder() { + clear(); + } + + id_type root() const { + return 0; + } + + id_type child(id_type id) const { + return units_[id].child(); + } + id_type sibling(id_type id) const { + return units_[id].has_sibling() ? (id + 1) : 0; + } + int value(id_type id) const { + return units_[id].value(); + } + + bool is_leaf(id_type id) const { + return label(id) == '\0'; + } + uchar_type label(id_type id) const { + return labels_[id]; + } + + bool is_intersection(id_type id) const { + return is_intersections_[id]; + } + id_type intersection_id(id_type id) const { + return is_intersections_.rank(id) - 1; + } + + std::size_t num_intersections() const { + return is_intersections_.num_ones(); + } + + std::size_t size() const { + return units_.size(); + } + + void init(); + void finish(); + + void insert(const char *key, std::size_t length, value_type value); + + void clear(); + + private: + enum { INITIAL_TABLE_SIZE = 1 << 10 }; + + AutoPool nodes_; + AutoPool units_; + AutoPool labels_; + BitVector is_intersections_; + AutoPool table_; + AutoStack node_stack_; + AutoStack recycle_bin_; + std::size_t num_states_; + + // Disallows copy and assignment. 
+ DawgBuilder(const DawgBuilder &); + DawgBuilder &operator=(const DawgBuilder &); + + void flush(id_type id); + + void expand_table(); + + id_type find_unit(id_type id, id_type *hash_id) const; + id_type find_node(id_type node_id, id_type *hash_id) const; + + bool are_equal(id_type node_id, id_type unit_id) const; + + id_type hash_unit(id_type id) const; + id_type hash_node(id_type id) const; + + id_type append_node(); + id_type append_unit(); + + void free_node(id_type id) { + recycle_bin_.push(id); + } + + static id_type hash(id_type key) { + key = ~key + (key << 15); // key = (key << 15) - key - 1; + key = key ^ (key >> 12); + key = key + (key << 2); + key = key ^ (key >> 4); + key = key * 2057; // key = (key + (key << 3)) + (key << 11); + key = key ^ (key >> 16); + return key; + } +}; + +inline void DawgBuilder::init() { + table_.resize(INITIAL_TABLE_SIZE, 0); + + append_node(); + append_unit(); + + num_states_ = 1; + + nodes_[0].set_label(0xFF); + node_stack_.push(0); +} + +inline void DawgBuilder::finish() { + flush(0); + + units_[0] = nodes_[0].unit(); + labels_[0] = nodes_[0].label(); + + nodes_.clear(); + table_.clear(); + node_stack_.clear(); + recycle_bin_.clear(); + + is_intersections_.build(); +} + +inline void DawgBuilder::insert(const char *key, std::size_t length, + value_type value) { + if (value < 0) { + DARTS_THROW("failed to insert key: negative value"); + } else if (length == 0) { + DARTS_THROW("failed to insert key: zero-length key"); + } + + id_type id = 0; + std::size_t key_pos = 0; + + for ( ; key_pos <= length; ++key_pos) { + id_type child_id = nodes_[id].child(); + if (child_id == 0) { + break; + } + + uchar_type key_label = static_cast(key[key_pos]); + if (key_pos < length && key_label == '\0') { + DARTS_THROW("failed to insert key: invalid null character"); + } + + uchar_type unit_label = nodes_[child_id].label(); + if (key_label < unit_label) { + DARTS_THROW("failed to insert key: wrong key order"); + } else if (key_label > 
unit_label) { + nodes_[child_id].set_has_sibling(true); + flush(child_id); + break; + } + id = child_id; + } + + if (key_pos > length) { + return; + } + + for ( ; key_pos <= length; ++key_pos) { + uchar_type key_label = static_cast( + (key_pos < length) ? key[key_pos] : '\0'); + id_type child_id = append_node(); + + if (nodes_[id].child() == 0) { + nodes_[child_id].set_is_state(true); + } + nodes_[child_id].set_sibling(nodes_[id].child()); + nodes_[child_id].set_label(key_label); + nodes_[id].set_child(child_id); + node_stack_.push(child_id); + + id = child_id; + } + nodes_[id].set_value(value); +} + +inline void DawgBuilder::clear() { + nodes_.clear(); + units_.clear(); + labels_.clear(); + is_intersections_.clear(); + table_.clear(); + node_stack_.clear(); + recycle_bin_.clear(); + num_states_ = 0; +} + +inline void DawgBuilder::flush(id_type id) { + while (node_stack_.top() != id) { + id_type node_id = node_stack_.top(); + node_stack_.pop(); + + if (num_states_ >= table_.size() - (table_.size() >> 2)) { + expand_table(); + } + + id_type num_siblings = 0; + for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { + ++num_siblings; + } + + id_type hash_id; + id_type match_id = find_node(node_id, &hash_id); + if (match_id != 0) { + is_intersections_.set(match_id, true); + } else { + id_type unit_id = 0; + for (id_type i = 0; i < num_siblings; ++i) { + unit_id = append_unit(); + } + for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { + units_[unit_id] = nodes_[i].unit(); + labels_[unit_id] = nodes_[i].label(); + --unit_id; + } + match_id = unit_id + 1; + table_[hash_id] = match_id; + ++num_states_; + } + + for (id_type i = node_id, next; i != 0; i = next) { + next = nodes_[i].sibling(); + free_node(i); + } + + nodes_[node_stack_.top()].set_child(match_id); + } + node_stack_.pop(); +} + +inline void DawgBuilder::expand_table() { + std::size_t table_size = table_.size() << 1; + table_.clear(); + table_.resize(table_size, 0); + + for (std::size_t i = 1; i 
< units_.size(); ++i) { + id_type id = static_cast(i); + if (labels_[id] == '\0' || units_[id].is_state()) { + id_type hash_id; + find_unit(id, &hash_id); + table_[hash_id] = id; + } + } +} + +inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { + *hash_id = hash_unit(id) % table_.size(); + for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { + id_type unit_id = table_[*hash_id]; + if (unit_id == 0) { + break; + } + + // There must not be the same unit. + } + return 0; +} + +inline id_type DawgBuilder::find_node(id_type node_id, + id_type *hash_id) const { + *hash_id = hash_node(node_id) % table_.size(); + for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { + id_type unit_id = table_[*hash_id]; + if (unit_id == 0) { + break; + } + + if (are_equal(node_id, unit_id)) { + return unit_id; + } + } + return 0; +} + +inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { + for (id_type i = nodes_[node_id].sibling(); i != 0; + i = nodes_[i].sibling()) { + if (units_[unit_id].has_sibling() == false) { + return false; + } + ++unit_id; + } + if (units_[unit_id].has_sibling() == true) { + return false; + } + + for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { + if (nodes_[i].unit() != units_[unit_id].unit() || + nodes_[i].label() != labels_[unit_id]) { + return false; + } + } + return true; +} + +inline id_type DawgBuilder::hash_unit(id_type id) const { + id_type hash_value = 0; + for ( ; id != 0; ++id) { + id_type unit = units_[id].unit(); + uchar_type label = labels_[id]; + hash_value ^= hash((label << 24) ^ unit); + + if (units_[id].has_sibling() == false) { + break; + } + } + return hash_value; +} + +inline id_type DawgBuilder::hash_node(id_type id) const { + id_type hash_value = 0; + for ( ; id != 0; id = nodes_[id].sibling()) { + id_type unit = nodes_[id].unit(); + uchar_type label = nodes_[id].label(); + hash_value ^= hash((label << 24) ^ unit); + } + return hash_value; +} + +inline id_type 
DawgBuilder::append_unit() { + is_intersections_.append(); + units_.append(); + labels_.append(); + + return static_cast(is_intersections_.size() - 1); +} + +inline id_type DawgBuilder::append_node() { + id_type id; + if (recycle_bin_.empty()) { + id = static_cast(nodes_.size()); + nodes_.append(); + } else { + id = recycle_bin_.top(); + nodes_[id] = DawgNode(); + recycle_bin_.pop(); + } + return id; +} + +// +// Unit of double-array builder. +// + +class DoubleArrayBuilderUnit { + public: + DoubleArrayBuilderUnit() : unit_(0) {} + + void set_has_leaf(bool has_leaf) { + if (has_leaf) { + unit_ |= 1U << 8; + } else { + unit_ &= ~(1U << 8); + } + } + void set_value(value_type value) { + unit_ = value | (1U << 31); + } + void set_label(uchar_type label) { + unit_ = (unit_ & ~0xFFU) | label; + } + void set_offset(id_type offset) { + if (offset >= 1U << 29) { + DARTS_THROW("failed to modify unit: too large offset"); + } + unit_ &= (1U << 31) | (1U << 8) | 0xFF; + if (offset < 1U << 21) { + unit_ |= (offset << 10); + } else { + unit_ |= (offset << 2) | (1U << 9); + } + } + + private: + id_type unit_; + + // Copyable. +}; + +// +// Extra unit of double-array builder. +// + +class DoubleArrayBuilderExtraUnit { + public: + DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), + is_fixed_(false), is_used_(false) {} + + void set_prev(id_type prev) { + prev_ = prev; + } + void set_next(id_type next) { + next_ = next; + } + void set_is_fixed(bool is_fixed) { + is_fixed_ = is_fixed; + } + void set_is_used(bool is_used) { + is_used_ = is_used; + } + + id_type prev() const { + return prev_; + } + id_type next() const { + return next_; + } + bool is_fixed() const { + return is_fixed_; + } + bool is_used() const { + return is_used_; + } + + private: + id_type prev_; + id_type next_; + bool is_fixed_; + bool is_used_; + + // Copyable. +}; + +// +// DAWG -> double-array converter. 
+// + +class DoubleArrayBuilder { + public: + explicit DoubleArrayBuilder(progress_func_type progress_func) + : progress_func_(progress_func), units_(), extras_(), labels_(), + table_(), extras_head_(0) {} + ~DoubleArrayBuilder() { + clear(); + } + + template + void build(const Keyset &keyset); + void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; + + void clear(); + + private: + enum { BLOCK_SIZE = 256 }; + enum { NUM_EXTRA_BLOCKS = 16 }; + enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS }; + + enum { UPPER_MASK = 0xFF << 21 }; + enum { LOWER_MASK = 0xFF }; + + typedef DoubleArrayBuilderUnit unit_type; + typedef DoubleArrayBuilderExtraUnit extra_type; + + progress_func_type progress_func_; + AutoPool units_; + AutoArray extras_; + AutoPool labels_; + AutoArray table_; + id_type extras_head_; + + // Disallows copy and assignment. + DoubleArrayBuilder(const DoubleArrayBuilder &); + DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); + + std::size_t num_blocks() const { + return units_.size() / BLOCK_SIZE; + } + + const extra_type &extras(id_type id) const { + return extras_[id % NUM_EXTRAS]; + } + extra_type &extras(id_type id) { + return extras_[id % NUM_EXTRAS]; + } + + template + void build_dawg(const Keyset &keyset, DawgBuilder *dawg_builder); + void build_from_dawg(const DawgBuilder &dawg); + void build_from_dawg(const DawgBuilder &dawg, + id_type dawg_id, id_type dic_id); + id_type arrange_from_dawg(const DawgBuilder &dawg, + id_type dawg_id, id_type dic_id); + + template + void build_from_keyset(const Keyset &keyset); + template + void build_from_keyset(const Keyset &keyset, std::size_t begin, + std::size_t end, std::size_t depth, id_type dic_id); + template + id_type arrange_from_keyset(const Keyset &keyset, std::size_t begin, + std::size_t end, std::size_t depth, id_type dic_id); + + id_type find_valid_offset(id_type id) const; + bool is_valid_offset(id_type id, id_type offset) const; + + void reserve_id(id_type id); + void 
expand_units(); + + void fix_all_blocks(); + void fix_block(id_type block_id); +}; + +template +void DoubleArrayBuilder::build(const Keyset &keyset) { + if (keyset.has_values()) { + Details::DawgBuilder dawg_builder; + build_dawg(keyset, &dawg_builder); + build_from_dawg(dawg_builder); + dawg_builder.clear(); + } else { + build_from_keyset(keyset); + } +} + +inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, + DoubleArrayUnit **buf_ptr) const { + if (size_ptr != NULL) { + *size_ptr = units_.size(); + } + if (buf_ptr != NULL) { + *buf_ptr = new DoubleArrayUnit[units_.size()]; + unit_type *units = reinterpret_cast(*buf_ptr); + for (std::size_t i = 0; i < units_.size(); ++i) { + units[i] = units_[i]; + } + } +} + +inline void DoubleArrayBuilder::clear() { + units_.clear(); + extras_.clear(); + labels_.clear(); + table_.clear(); + extras_head_ = 0; +} + +template +void DoubleArrayBuilder::build_dawg(const Keyset &keyset, + DawgBuilder *dawg_builder) { + dawg_builder->init(); + for (std::size_t i = 0; i < keyset.num_keys(); ++i) { + dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); + if (progress_func_ != NULL) { + progress_func_(i + 1, keyset.num_keys() + 1); + } + } + dawg_builder->finish(); +} + +inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { + std::size_t num_units = 1; + while (num_units < dawg.size()) { + num_units <<= 1; + } + units_.reserve(num_units); + + table_.reset(new id_type[dawg.num_intersections()]); + for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { + table_[i] = 0; + } + + extras_.reset(new extra_type[NUM_EXTRAS]); + + reserve_id(0); + extras(0).set_is_used(true); + units_[0].set_offset(1); + units_[0].set_label('\0'); + + if (dawg.child(dawg.root()) != 0) { + build_from_dawg(dawg, dawg.root(), 0); + } + + fix_all_blocks(); + + extras_.clear(); + labels_.clear(); + table_.clear(); +} + +inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, + id_type dawg_id, 
id_type dic_id) { + id_type dawg_child_id = dawg.child(dawg_id); + if (dawg.is_intersection(dawg_child_id)) { + id_type intersection_id = dawg.intersection_id(dawg_child_id); + id_type offset = table_[intersection_id]; + if (offset != 0) { + offset ^= dic_id; + if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { + if (dawg.is_leaf(dawg_child_id)) { + units_[dic_id].set_has_leaf(true); + } + units_[dic_id].set_offset(offset); + return; + } + } + } + + id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); + if (dawg.is_intersection(dawg_child_id)) { + table_[dawg.intersection_id(dawg_child_id)] = offset; + } + + do { + uchar_type child_label = dawg.label(dawg_child_id); + id_type dic_child_id = offset ^ child_label; + if (child_label != '\0') { + build_from_dawg(dawg, dawg_child_id, dic_child_id); + } + dawg_child_id = dawg.sibling(dawg_child_id); + } while (dawg_child_id != 0); +} + +inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, + id_type dawg_id, id_type dic_id) { + labels_.resize(0); + + id_type dawg_child_id = dawg.child(dawg_id); + while (dawg_child_id != 0) { + labels_.append(dawg.label(dawg_child_id)); + dawg_child_id = dawg.sibling(dawg_child_id); + } + + id_type offset = find_valid_offset(dic_id); + units_[dic_id].set_offset(dic_id ^ offset); + + dawg_child_id = dawg.child(dawg_id); + for (std::size_t i = 0; i < labels_.size(); ++i) { + id_type dic_child_id = offset ^ labels_[i]; + reserve_id(dic_child_id); + + if (dawg.is_leaf(dawg_child_id)) { + units_[dic_id].set_has_leaf(true); + units_[dic_child_id].set_value(dawg.value(dawg_child_id)); + } else { + units_[dic_child_id].set_label(labels_[i]); + } + + dawg_child_id = dawg.sibling(dawg_child_id); + } + extras(offset).set_is_used(true); + + return offset; +} + +template +void DoubleArrayBuilder::build_from_keyset(const Keyset &keyset) { + std::size_t num_units = 1; + while (num_units < keyset.num_keys()) { + num_units <<= 1; + } + units_.reserve(num_units); + + 
extras_.reset(new extra_type[NUM_EXTRAS]); + + reserve_id(0); + extras(0).set_is_used(true); + units_[0].set_offset(1); + units_[0].set_label('\0'); + + if (keyset.num_keys() > 0) { + build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); + } + + fix_all_blocks(); + + extras_.clear(); + labels_.clear(); +} + +template +void DoubleArrayBuilder::build_from_keyset(const Keyset &keyset, + std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { + id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); + + while (begin < end) { + if (keyset.keys(begin, depth) != '\0') { + break; + } + ++begin; + } + if (begin == end) { + return; + } + + std::size_t last_begin = begin; + uchar_type last_label = keyset.keys(begin, depth); + while (++begin < end) { + uchar_type label = keyset.keys(begin, depth); + if (label != last_label) { + build_from_keyset(keyset, last_begin, begin, + depth + 1, offset ^ last_label); + last_begin = begin; + last_label = keyset.keys(begin, depth); + } + } + build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); +} + +template +id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset &keyset, + std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { + labels_.resize(0); + + value_type value = -1; + for (std::size_t i = begin; i < end; ++i) { + uchar_type label = keyset.keys(i, depth); + if (label == '\0') { + if (keyset.has_lengths() && depth < keyset.lengths(i)) { + DARTS_THROW("failed to build double-array: " + "invalid null character"); + } else if (keyset.values(i) < 0) { + DARTS_THROW("failed to build double-array: negative value"); + } + + if (value == -1) { + value = keyset.values(i); + } + if (progress_func_ != NULL) { + progress_func_(i + 1, keyset.num_keys() + 1); + } + } + + if (labels_.empty()) { + labels_.append(label); + } else if (label != labels_[labels_.size() - 1]) { + if (label < labels_[labels_.size() - 1]) { + DARTS_THROW("failed to build double-array: wrong 
key order"); + } + labels_.append(label); + } + } + + id_type offset = find_valid_offset(dic_id); + units_[dic_id].set_offset(dic_id ^ offset); + + for (std::size_t i = 0; i < labels_.size(); ++i) { + id_type dic_child_id = offset ^ labels_[i]; + reserve_id(dic_child_id); + if (labels_[i] == '\0') { + units_[dic_id].set_has_leaf(true); + units_[dic_child_id].set_value(value); + } else { + units_[dic_child_id].set_label(labels_[i]); + } + } + extras(offset).set_is_used(true); + + return offset; +} + +inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { + if (extras_head_ >= units_.size()) { + return units_.size() | (id & LOWER_MASK); + } + + id_type unfixed_id = extras_head_; + do { + id_type offset = unfixed_id ^ labels_[0]; + if (is_valid_offset(id, offset)) { + return offset; + } + unfixed_id = extras(unfixed_id).next(); + } while (unfixed_id != extras_head_); + + return units_.size() | (id & LOWER_MASK); +} + +inline bool DoubleArrayBuilder::is_valid_offset(id_type id, + id_type offset) const { + if (extras(offset).is_used()) { + return false; + } + + id_type rel_offset = id ^ offset; + if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { + return false; + } + + for (std::size_t i = 1; i < labels_.size(); ++i) { + if (extras(offset ^ labels_[i]).is_fixed()) { + return false; + } + } + + return true; +} + +inline void DoubleArrayBuilder::reserve_id(id_type id) { + if (id >= units_.size()) { + expand_units(); + } + + if (id == extras_head_) { + extras_head_ = extras(id).next(); + if (extras_head_ == id) { + extras_head_ = units_.size(); + } + } + extras(extras(id).prev()).set_next(extras(id).next()); + extras(extras(id).next()).set_prev(extras(id).prev()); + extras(id).set_is_fixed(true); +} + +inline void DoubleArrayBuilder::expand_units() { + id_type src_num_units = units_.size(); + id_type src_num_blocks = num_blocks(); + + id_type dest_num_units = src_num_units + BLOCK_SIZE; + id_type dest_num_blocks = src_num_blocks + 1; + + if 
(dest_num_blocks > NUM_EXTRA_BLOCKS) { + fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); + } + + units_.resize(dest_num_units); + + if (dest_num_blocks > NUM_EXTRA_BLOCKS) { + for (std::size_t id = src_num_units; id < dest_num_units; ++id) { + extras(id).set_is_used(false); + extras(id).set_is_fixed(false); + } + } + + for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { + extras(i - 1).set_next(i); + extras(i).set_prev(i - 1); + } + + extras(src_num_units).set_prev(dest_num_units - 1); + extras(dest_num_units - 1).set_next(src_num_units); + + extras(src_num_units).set_prev(extras(extras_head_).prev()); + extras(dest_num_units - 1).set_next(extras_head_); + + extras(extras(extras_head_).prev()).set_next(src_num_units); + extras(extras_head_).set_prev(dest_num_units - 1); +} + +inline void DoubleArrayBuilder::fix_all_blocks() { + id_type begin = 0; + if (num_blocks() > NUM_EXTRA_BLOCKS) { + begin = num_blocks() - NUM_EXTRA_BLOCKS; + } + id_type end = num_blocks(); + + for (id_type block_id = begin; block_id != end; ++block_id) { + fix_block(block_id); + } +} + +inline void DoubleArrayBuilder::fix_block(id_type block_id) { + id_type begin = block_id * BLOCK_SIZE; + id_type end = begin + BLOCK_SIZE; + + id_type unused_offset = 0; + for (id_type offset = begin; offset != end; ++offset) { + if (!extras(offset).is_used()) { + unused_offset = offset; + break; + } + } + + for (id_type id = begin; id != end; ++id) { + if (!extras(id).is_fixed()) { + reserve_id(id); + units_[id].set_label(static_cast(id ^ unused_offset)); + } + } +} + +} // namespace Details + +// +// Member function build() of DoubleArrayImpl. 
+// + +template +int DoubleArrayImpl::build(std::size_t num_keys, + const key_type * const *keys, const std::size_t *lengths, + const value_type *values, Details::progress_func_type progress_func) { + Details::Keyset keyset(num_keys, keys, lengths, values); + + Details::DoubleArrayBuilder builder(progress_func); + builder.build(keyset); + + std::size_t size = 0; + unit_type *buf = NULL; + builder.copy(&size, &buf); + + clear(); + + size_ = size; + array_ = buf; + buf_ = buf; + + if (progress_func != NULL) { + progress_func(num_keys + 1, num_keys + 1); + } + + return 0; +} + +} // namespace Darts + +#undef DARTS_INT_TO_STR +#undef DARTS_LINE_TO_STR +#undef DARTS_LINE_STR +#undef DARTS_THROW + +#endif // DARTS_H_ diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h new file mode 100644 index 00000000000000..3cf22594c2c1eb --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace doris::segment_v2::kuromoji { + +// "DORISKMJ" as 8 bytes; little-endian only. 
+inline constexpr char KMJ_MAGIC[8] = {'D', 'O', 'R', 'I', 'S', 'K', 'M', 'J'}; +inline constexpr uint32_t KMJ_FORMAT_VERSION = 1; + +enum KmjFileKind : uint32_t { + KMJ_KIND_SYSTEM = 1, + KMJ_KIND_MATRIX = 2, + KMJ_KIND_CHARDEF = 3, + KMJ_KIND_UNKDICT = 4, +}; + +// Canonical character categories (ordinals are stable on disk). Mirrors Lucene's set. +enum CharCategory : uint8_t { + CAT_NGRAM = 0, + CAT_DEFAULT = 1, + CAT_SPACE = 2, + CAT_SYMBOL = 3, + CAT_NUMERIC = 4, + CAT_ALPHA = 5, + CAT_CYRILLIC = 6, + CAT_GREEK = 7, + CAT_HIRAGANA = 8, + CAT_KATAKANA = 9, + CAT_KANJI = 10, + CAT_KANJINUMERIC = 11, + CAT_CLASS_COUNT = 12, +}; + +#pragma pack(push, 1) +// Common 32-byte file header at offset 0 of every .bin. All ints little-endian. +struct KmjFileHeader { + char magic[8]; // KMJ_MAGIC + uint32_t format_version; // KMJ_FORMAT_VERSION + uint32_t file_kind; // KmjFileKind + uint64_t file_size; // total file bytes (sanity vs fstat) + uint8_t reserved[8]; // zero +}; + +// system.bin sub-header (follows KmjFileHeader). Offsets are absolute from file start. 
+struct KmjSystemHeader { + uint64_t trie_offset; + uint64_t trie_bytes; // Darts array (4-byte units) + uint64_t runs_offset; + uint64_t runs_count; // WordIdRun[runs_count], indexed by trie value + uint64_t entries_offset; + uint64_t entries_count; // WordEntry[entries_count], indexed by word id + uint64_t features_offset; + uint64_t features_bytes; // length-prefixed UTF-8 blob +}; + +struct KmjMatrixHeader { + uint32_t forward_size; // right-context cardinality + uint32_t backward_size; // left-context cardinality + uint64_t cells_offset; // int16[forward_size*backward_size], row-major by backward_id +}; + +struct KmjCharDefHeader { + uint32_t class_count; // == CAT_CLASS_COUNT for IPADIC + uint32_t reserved; + uint64_t catmap_offset; // uint8[0x10000], one category per BMP code point + uint64_t defs_offset; // CategoryDef[class_count] +}; + +struct KmjUnkHeader { + uint32_t class_count; + uint32_t reserved; + uint64_t runs_offset; // WordIdRun[class_count], indexed by category ordinal + uint64_t entries_offset; + uint64_t entries_count; // WordEntry[entries_count] + uint64_t features_offset; + uint64_t features_bytes; +}; + +// 12 bytes. Indexed by word id (a plain 0-based index here). +struct WordEntry { + int16_t left_id; + int16_t right_id; + int16_t word_cost; + uint16_t pad; // keep 4-byte alignment; reserved + uint32_t feature_offset; // byte offset into the features blob; 0xFFFFFFFF == none +}; + +// 8 bytes. system.bin: indexed by trie value. unkdict.bin: indexed by category ordinal. +struct WordIdRun { + uint32_t entry_start; // first word id in this run + uint32_t count; // number of entries in this run +}; + +// 4 bytes. Indexed by category ordinal. 
+struct CategoryDef { + uint8_t invoke; // 0/1 + uint8_t group; // 0/1 + uint16_t length; // max grouping length (kept for fidelity; Lucene ignores it) +}; +#pragma pack(pop) + +inline constexpr uint32_t KMJ_NO_FEATURE = 0xFFFFFFFFU; + +// Connection cost accessor (MeCab/Lucene convention, verbatim): +// forward_id = left node's right-context-id; backward_id = right node's left-context-id. +inline int16_t connection_cost(const int16_t* cells, uint32_t forward_size, uint32_t forward_id, + uint32_t backward_id) { + return cells[static_cast(backward_id) * forward_size + forward_id]; +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.cpp b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.cpp new file mode 100644 index 00000000000000..956a5c98420775 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.cpp @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "common/logging.h" + +namespace doris::segment_v2::kuromoji { + +MappedFile::~MappedFile() { + if (_data != nullptr) { + ::munmap(const_cast(_data), _size); + _data = nullptr; + _size = 0; + } +} + +Status MappedFile::open(const std::string& path) { + int fd = ::open(path.c_str(), O_RDONLY); + if (fd < 0) { + return Status::IOError("kuromoji dict: cannot open {}", path); + } + struct stat st {}; + if (::fstat(fd, &st) != 0 || st.st_size <= 0) { + ::close(fd); + return Status::IOError("kuromoji dict: cannot stat {}", path); + } + auto bytes = static_cast(st.st_size); + void* m = ::mmap(nullptr, bytes, PROT_READ, MAP_PRIVATE, fd, 0); + ::close(fd); + if (m == MAP_FAILED) { + return Status::IOError("kuromoji dict: mmap failed for {}", path); + } + _data = static_cast(m); + _size = bytes; + return Status::OK(); +} + +Status KuromojiDictionary::check_header(const uint8_t* p, std::size_t size, KmjFileKind kind) { + if (size < sizeof(KmjFileHeader)) { + return Status::Corruption("kuromoji dict: file too small"); + } + KmjFileHeader h {}; + std::memcpy(&h, p, sizeof(h)); + if (std::memcmp(h.magic, KMJ_MAGIC, sizeof(h.magic)) != 0) { + return Status::Corruption("kuromoji dict: bad magic"); + } + if (h.format_version != KMJ_FORMAT_VERSION) { + return Status::Corruption("kuromoji dict: version {} != {}", h.format_version, + KMJ_FORMAT_VERSION); + } + if (h.file_kind != static_cast(kind)) { + return Status::Corruption("kuromoji dict: wrong file_kind {}", h.file_kind); + } + if (h.file_size != size) { + return Status::Corruption("kuromoji dict: file_size {} != actual {}", h.file_size, size); + } + return Status::OK(); +} + +std::string_view KuromojiDictionary::feature_at(const uint8_t* blob, uint64_t blob_bytes, + uint32_t off) { + if (off == KMJ_NO_FEATURE || blob == nullptr || static_cast(off) + 2 > blob_bytes) 
{ + return {}; + } + auto len = static_cast(static_cast(blob[off]) | + static_cast(blob[off + 1] << 8)); + if (static_cast(off) + 2 + len > blob_bytes) { + return {}; + } + return {reinterpret_cast(blob + off + 2), len}; +} + +Status KuromojiDictionary::map_system(const std::string& path) { + RETURN_IF_ERROR(_system_map.open(path)); + const uint8_t* p = _system_map.data(); + RETURN_IF_ERROR(check_header(p, _system_map.size(), KMJ_KIND_SYSTEM)); + KmjSystemHeader s {}; + std::memcpy(&s, p + sizeof(KmjFileHeader), sizeof(s)); + _runs = reinterpret_cast(p + s.runs_offset); + _entries = reinterpret_cast(p + s.entries_offset); + _features = p + s.features_offset; + _features_bytes = s.features_bytes; + if (s.trie_bytes > 0) { + // size is in 4-byte units; the mmap outlives _trie (both owned by this object). + _trie.set_array(p + s.trie_offset, static_cast(s.trie_bytes / 4)); + } + return Status::OK(); +} + +Status KuromojiDictionary::map_matrix(const std::string& path) { + RETURN_IF_ERROR(_matrix_map.open(path)); + const uint8_t* p = _matrix_map.data(); + RETURN_IF_ERROR(check_header(p, _matrix_map.size(), KMJ_KIND_MATRIX)); + KmjMatrixHeader m {}; + std::memcpy(&m, p + sizeof(KmjFileHeader), sizeof(m)); + _forward_size = m.forward_size; + _cells = reinterpret_cast(p + m.cells_offset); + return Status::OK(); +} + +Status KuromojiDictionary::map_chardef(const std::string& path) { + RETURN_IF_ERROR(_chardef_map.open(path)); + const uint8_t* p = _chardef_map.data(); + RETURN_IF_ERROR(check_header(p, _chardef_map.size(), KMJ_KIND_CHARDEF)); + KmjCharDefHeader c {}; + std::memcpy(&c, p + sizeof(KmjFileHeader), sizeof(c)); + _catmap = p + c.catmap_offset; + _defs = reinterpret_cast(p + c.defs_offset); + return Status::OK(); +} + +Status KuromojiDictionary::map_unkdict(const std::string& path) { + RETURN_IF_ERROR(_unk_map.open(path)); + const uint8_t* p = _unk_map.data(); + RETURN_IF_ERROR(check_header(p, _unk_map.size(), KMJ_KIND_UNKDICT)); + KmjUnkHeader u {}; + 
std::memcpy(&u, p + sizeof(KmjFileHeader), sizeof(u)); + _unk_runs = reinterpret_cast(p + u.runs_offset); + _unk_entries = reinterpret_cast(p + u.entries_offset); + _unk_features = p + u.features_offset; + _unk_features_bytes = u.features_bytes; + return Status::OK(); +} + +Status KuromojiDictionary::load(const std::string& dir, std::unique_ptr* out) { + auto dict = std::make_unique(); + RETURN_IF_ERROR(dict->map_system(dir + "/system.bin")); + RETURN_IF_ERROR(dict->map_matrix(dir + "/matrix.bin")); + RETURN_IF_ERROR(dict->map_chardef(dir + "/chardef.bin")); + RETURN_IF_ERROR(dict->map_unkdict(dir + "/unkdict.bin")); + *out = std::move(dict); + return Status::OK(); +} + +const KuromojiDictionary* KuromojiDictionary::get_or_load(const std::string& dir) { + static std::mutex mu; + static std::map> cache; + std::lock_guard lock(mu); + auto it = cache.find(dir); + if (it != cache.end()) { + return it->second.get(); // may be nullptr if a prior load failed + } + std::unique_ptr dict; + Status st = load(dir, &dict); + if (!st.ok()) { + LOG(WARNING) << "kuromoji: failed to load dictionary from " << dir << ": " << st; + cache.emplace(dir, nullptr); + return nullptr; + } + const KuromojiDictionary* ptr = dict.get(); + cache.emplace(dir, std::move(dict)); + return ptr; +} + +void KuromojiDictionary::common_prefix_search(const char* text, std::size_t len, + std::vector* out) const { + out->clear(); + constexpr std::size_t kBatch = 64; + Darts::DoubleArray::result_pair_type results[kBatch]; + std::size_t n = _trie.commonPrefixSearch(text, results, kBatch, len); + if (n > kBatch) { + // Rare: more prefix matches than the stack buffer; re-query with an exact buffer. + std::vector big(n); + std::size_t m = _trie.commonPrefixSearch(text, big.data(), n, len); + std::size_t take = m < n ? 
m : n; + for (std::size_t i = 0; i < take; ++i) { + out->push_back( + {static_cast(big[i].value), static_cast(big[i].length)}); + } + return; + } + for (std::size_t i = 0; i < n; ++i) { + out->push_back({static_cast(results[i].value), + static_cast(results[i].length)}); + } +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h new file mode 100644 index 00000000000000..c10146243d24d1 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/darts.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" + +namespace doris::segment_v2::kuromoji { + +// One read-only mmap region; unmaps on destruction. 
+class MappedFile { +public: + MappedFile() = default; + ~MappedFile(); + MappedFile(const MappedFile&) = delete; + MappedFile& operator=(const MappedFile&) = delete; + + Status open(const std::string& path); + const uint8_t* data() const { return _data; } + std::size_t size() const { return _size; } + +private: + const uint8_t* _data = nullptr; + std::size_t _size = 0; +}; + +// Read-only kuromoji dictionary backed by four mmapped files. After load() it is +// immutable and safe to share across threads. Returned string_views/pointers are +// valid only for the dictionary's lifetime. This is the query API the Phase-2 +// Viterbi tokenizer consumes. +class KuromojiDictionary { +public: + struct PrefixMatch { + uint32_t trie_value; + uint32_t length; // bytes consumed from the input + }; + + static Status load(const std::string& dir, std::unique_ptr* out); + + // Process-wide, per-directory cache: loads the dictionary at `dir` once and + // returns a stable pointer valid for the process lifetime. Returns nullptr if + // `dir` has no valid dictionary (the failure is logged and cached). Thread-safe. + static const KuromojiDictionary* get_or_load(const std::string& dir); + + void common_prefix_search(const char* text, std::size_t len, + std::vector* out) const; + + // System dictionary. + WordIdRun run_for_value(uint32_t trie_value) const { return _runs[trie_value]; } + const WordEntry& word(uint32_t word_id) const { return _entries[word_id]; } + std::string_view feature(const WordEntry& e) const { + return feature_at(_features, _features_bytes, e.feature_offset); + } + + // Connection costs. + int16_t connection_cost(uint32_t forward_id, uint32_t backward_id) const { + return ::doris::segment_v2::kuromoji::connection_cost(_cells, _forward_size, forward_id, + backward_id); + } + + // Character definitions. + uint8_t char_category(char32_t cp) const { + return cp < 0x10000 ? 
_catmap[cp] : static_cast(CAT_DEFAULT); + } + bool is_invoke(char32_t cp) const { return _defs[char_category(cp)].invoke != 0; } + bool is_group(char32_t cp) const { return _defs[char_category(cp)].group != 0; } + + // Unknown-word dictionary. + WordIdRun unknown_run(uint8_t category) const { return _unk_runs[category]; } + const WordEntry& unknown_word(uint32_t word_id) const { return _unk_entries[word_id]; } + std::string_view unknown_feature(const WordEntry& e) const { + return feature_at(_unk_features, _unk_features_bytes, e.feature_offset); + } + +private: + static std::string_view feature_at(const uint8_t* blob, uint64_t blob_bytes, uint32_t off); + static Status check_header(const uint8_t* p, std::size_t size, KmjFileKind kind); + Status map_system(const std::string& path); + Status map_matrix(const std::string& path); + Status map_chardef(const std::string& path); + Status map_unkdict(const std::string& path); + + MappedFile _system_map; + MappedFile _matrix_map; + MappedFile _chardef_map; + MappedFile _unk_map; + + // system + Darts::DoubleArray _trie; + const WordIdRun* _runs = nullptr; + const WordEntry* _entries = nullptr; + const uint8_t* _features = nullptr; + uint64_t _features_bytes = 0; + // matrix + const int16_t* _cells = nullptr; + uint32_t _forward_size = 0; + // chardef + const uint8_t* _catmap = nullptr; + const CategoryDef* _defs = nullptr; + // unk + const WordIdRun* _unk_runs = nullptr; + const WordEntry* _unk_entries = nullptr; + const uint8_t* _unk_features = nullptr; + uint64_t _unk_features_bytes = 0; +}; + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.cpp b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.cpp new file mode 100644 index 00000000000000..1cbb29ea84ddd0 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.cpp @@ -0,0 +1,263 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" + +#include +#include +#include +#include + +#include "storage/index/inverted/analyzer/kuromoji/dict/darts.h" + +namespace doris::segment_v2::kuromoji { + +namespace { + +// Append-only byte buffer that tracks offset and 8-byte-aligns sections. +class ByteSink { +public: + void align8() { + while ((_buf.size() % 8) != 0) { + _buf.push_back(0); + } + } + uint64_t offset() const { return static_cast(_buf.size()); } + void put(const void* p, std::size_t n) { + const auto* b = static_cast(p); + _buf.insert(_buf.end(), b, b + n); + } + template + void put_pod(const T& v) { + put(&v, sizeof(T)); + } + std::vector& buf() { return _buf; } + +private: + std::vector _buf; +}; + +KmjFileHeader make_header(KmjFileKind kind) { + KmjFileHeader h {}; + std::memcpy(h.magic, KMJ_MAGIC, sizeof(h.magic)); + h.format_version = KMJ_FORMAT_VERSION; + h.file_kind = kind; + h.file_size = 0; // patched at flush + return h; +} + +// Appends one run's worth of entries (+ feature blob bytes) and returns the run. 
+WordIdRun append_words(const std::vector& words, std::vector& entries, + std::vector& features) { + WordIdRun run {static_cast(entries.size()), static_cast(words.size())}; + for (const auto& w : words) { + WordEntry e {}; + e.left_id = w.left_id; + e.right_id = w.right_id; + e.word_cost = w.word_cost; + e.pad = 0; + if (w.feature.empty()) { + e.feature_offset = KMJ_NO_FEATURE; + } else { + e.feature_offset = static_cast(features.size()); + const auto len = + static_cast(std::min(w.feature.size(), 0xFFFFU)); + features.push_back(static_cast(len & 0xFFU)); + features.push_back(static_cast((len >> 8) & 0xFFU)); + const auto* p = reinterpret_cast(w.feature.data()); + features.insert(features.end(), p, p + len); + } + entries.push_back(e); + } + return run; +} + +Status flush_file(const std::string& path, std::vector& buf) { + auto* hdr = reinterpret_cast(buf.data()); + hdr->file_size = static_cast(buf.size()); + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out) { + return Status::IOError("kuromoji dict: cannot open {} for write", path); + } + out.write(reinterpret_cast(buf.data()), static_cast(buf.size())); + if (!out) { + return Status::IOError("kuromoji dict: short write to {}", path); + } + return Status::OK(); +} + +} // namespace + +Status KuromojiDictionaryBuilder::write_system(const std::string& path, const SystemDictInput& in) { + // Sort surfaces by raw bytes (Darts requirement); trie value = sorted index. 
+ auto surfaces = in.surfaces; + std::sort(surfaces.begin(), surfaces.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + + std::vector runs; + std::vector entries; + std::vector features; + runs.reserve(surfaces.size()); + for (const auto& [surface, words] : surfaces) { + runs.push_back(append_words(words, entries, features)); + } + + Darts::DoubleArray da; + if (!surfaces.empty()) { + std::vector kptrs; + std::vector klens; + std::vector values; + kptrs.reserve(surfaces.size()); + klens.reserve(surfaces.size()); + values.reserve(surfaces.size()); + for (std::size_t i = 0; i < surfaces.size(); ++i) { + kptrs.push_back(surfaces[i].first.data()); + klens.push_back(surfaces[i].first.size()); + values.push_back(static_cast(i)); + } + try { + if (da.build(surfaces.size(), kptrs.data(), klens.data(), values.data()) != 0) { + return Status::InternalError("kuromoji dict: darts build failed"); + } + } catch (const std::exception& e) { + return Status::InternalError("kuromoji dict: darts build threw: {}", e.what()); + } + } + + ByteSink sink; + sink.put_pod(make_header(KMJ_KIND_SYSTEM)); + const uint64_t subhdr_at = sink.offset(); + KmjSystemHeader sub {}; + sink.put_pod(sub); + + sink.align8(); + sub.trie_offset = sink.offset(); + sub.trie_bytes = static_cast(da.total_size()); + if (sub.trie_bytes > 0) { + sink.put(da.array(), da.total_size()); + } + + sink.align8(); + sub.runs_offset = sink.offset(); + sub.runs_count = runs.size(); + if (!runs.empty()) { + sink.put(runs.data(), runs.size() * sizeof(WordIdRun)); + } + + sink.align8(); + sub.entries_offset = sink.offset(); + sub.entries_count = entries.size(); + if (!entries.empty()) { + sink.put(entries.data(), entries.size() * sizeof(WordEntry)); + } + + sink.align8(); + sub.features_offset = sink.offset(); + sub.features_bytes = features.size(); + if (!features.empty()) { + sink.put(features.data(), features.size()); + } + + std::memcpy(sink.buf().data() + subhdr_at, &sub, sizeof(sub)); + return 
flush_file(path, sink.buf()); +} + +Status KuromojiDictionaryBuilder::write_matrix(const std::string& path, const MatrixInput& in) { + if (in.cells.size() != static_cast(in.forward_size) * in.backward_size) { + return Status::InvalidArgument("kuromoji dict: matrix cell count mismatch"); + } + ByteSink sink; + sink.put_pod(make_header(KMJ_KIND_MATRIX)); + const uint64_t subhdr_at = sink.offset(); + KmjMatrixHeader sub {}; + sink.put_pod(sub); + + sink.align8(); + sub.forward_size = in.forward_size; + sub.backward_size = in.backward_size; + sub.cells_offset = sink.offset(); + if (!in.cells.empty()) { + sink.put(in.cells.data(), in.cells.size() * sizeof(int16_t)); + } + + std::memcpy(sink.buf().data() + subhdr_at, &sub, sizeof(sub)); + return flush_file(path, sink.buf()); +} + +Status KuromojiDictionaryBuilder::write_chardef(const std::string& path, const CharDefInput& in) { + ByteSink sink; + sink.put_pod(make_header(KMJ_KIND_CHARDEF)); + const uint64_t subhdr_at = sink.offset(); + KmjCharDefHeader sub {}; + sink.put_pod(sub); + + sink.align8(); + sub.class_count = static_cast(in.defs.size()); + sub.catmap_offset = sink.offset(); + sink.put(in.catmap.data(), in.catmap.size()); // exactly 0x10000 bytes + + sink.align8(); + sub.defs_offset = sink.offset(); + if (!in.defs.empty()) { + sink.put(in.defs.data(), in.defs.size() * sizeof(CategoryDef)); + } + + std::memcpy(sink.buf().data() + subhdr_at, &sub, sizeof(sub)); + return flush_file(path, sink.buf()); +} + +Status KuromojiDictionaryBuilder::write_unkdict(const std::string& path, const UnkDictInput& in) { + std::vector runs; + std::vector entries; + std::vector features; + runs.reserve(in.per_category.size()); + for (const auto& words : in.per_category) { + runs.push_back(append_words(words, entries, features)); + } + + ByteSink sink; + sink.put_pod(make_header(KMJ_KIND_UNKDICT)); + const uint64_t subhdr_at = sink.offset(); + KmjUnkHeader sub {}; + sink.put_pod(sub); + + sink.align8(); + sub.class_count = 
static_cast(runs.size()); + sub.runs_offset = sink.offset(); + if (!runs.empty()) { + sink.put(runs.data(), runs.size() * sizeof(WordIdRun)); + } + + sink.align8(); + sub.entries_offset = sink.offset(); + sub.entries_count = entries.size(); + if (!entries.empty()) { + sink.put(entries.data(), entries.size() * sizeof(WordEntry)); + } + + sink.align8(); + sub.features_offset = sink.offset(); + sub.features_bytes = features.size(); + if (!features.empty()) { + sink.put(features.data(), features.size()); + } + + std::memcpy(sink.buf().data() + subhdr_at, &sub, sizeof(sub)); + return flush_file(path, sink.buf()); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h new file mode 100644 index 00000000000000..4b054e82560931 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" + +namespace doris::segment_v2::kuromoji { + +// One morpheme entry to serialize. `feature` is the raw UTF-8 feature columns +// (comma-joined IPADIC fields), or empty for none. +struct BuilderWord { + int16_t left_id = 0; + int16_t right_id = 0; + int16_t word_cost = 0; + std::string feature; +}; + +// System dictionary input: (surface, homograph entries). Any order; the builder +// sorts surfaces by raw bytes (required by Darts). +struct SystemDictInput { + std::vector>> surfaces; +}; + +// Connection-cost matrix: cells row-major by backward_id, +// cells[backward_id * forward_size + forward_id]. +struct MatrixInput { + uint32_t forward_size = 0; + uint32_t backward_size = 0; + std::vector cells; +}; + +// Character definitions: each BMP code point -> category ordinal; per-category flags. +struct CharDefInput { + std::array catmap {}; + std::vector defs; // indexed by category ordinal +}; + +// Unknown-word entries keyed by category ordinal. +struct UnkDictInput { + std::vector> per_category; +}; + +// Offline serializer: writes the four mmap-friendly .bin files consumed by +// KuromojiDictionary. Build-time only (the offline converter + unit tests). 
+class KuromojiDictionaryBuilder { +public: + static Status write_system(const std::string& path, const SystemDictInput& in); + static Status write_matrix(const std::string& path, const MatrixInput& in); + static Status write_chardef(const std::string& path, const CharDefInput& in); + static Status write_unkdict(const std::string& path, const UnkDictInput& in); +}; + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.cpp b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.cpp new file mode 100644 index 00000000000000..80735acd72a627 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.cpp @@ -0,0 +1,301 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h" + +#include +#include +#include +#include + +namespace doris::segment_v2::kuromoji { + +namespace { + +constexpr std::string_view WS = " \t\r\n\f\v"; + +std::string_view trim(std::string_view s) { + const auto b = s.find_first_not_of(WS); + if (b == std::string_view::npos) { + return {}; + } + const auto e = s.find_last_not_of(WS); + return s.substr(b, e - b + 1); +} + +bool parse_dec(std::string_view s, int* v) { + s = trim(s); + if (s.empty()) { + return false; + } + const auto* end = s.data() + s.size(); + auto r = std::from_chars(s.data(), end, *v); + return r.ec == std::errc() && r.ptr == end; +} + +int16_t to_int16(int v) { + if (v > std::numeric_limits::max()) { + return std::numeric_limits::max(); + } + if (v < std::numeric_limits::min()) { + return std::numeric_limits::min(); + } + return static_cast(v); +} + +bool parse_hex_cp(std::string_view s, uint32_t* v) { + s = trim(s); + if (s.size() > 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { + s = s.substr(2); + } + if (s.empty()) { + return false; + } + const auto* end = s.data() + s.size(); + auto r = std::from_chars(s.data(), end, *v, 16); + return r.ec == std::errc() && r.ptr == end; +} + +// Strip an inline '#' comment and trim. Returns the content view (may be empty). +std::string_view strip_comment(std::string_view line) { + const auto h = line.find('#'); + if (h != std::string_view::npos) { + line = line.substr(0, h); + } + return trim(line); +} + +// Whitespace-separated tokens. +std::vector ws_tokens(std::string_view s) { + std::vector out; + std::size_t i = 0; + while (i < s.size()) { + const auto b = s.find_first_not_of(WS, i); + if (b == std::string_view::npos) { + break; + } + auto e = s.find_first_of(WS, b); + if (e == std::string_view::npos) { + e = s.size(); + } + out.push_back(s.substr(b, e - b)); + i = e; + } + return out; +} + +// Invoke `fn(line)` for each line in `content`. 
+template +void for_each_line(std::string_view content, F&& fn) { + std::size_t i = 0; + while (i <= content.size()) { + auto nl = content.find('\n', i); + if (nl == std::string_view::npos) { + nl = content.size(); + } + fn(content.substr(i, nl - i)); + if (nl == content.size()) { + break; + } + i = nl + 1; + } +} + +} // namespace + +uint8_t ipadic_category_ordinal(std::string_view name) { + name = trim(name); + if (name == "DEFAULT") return CAT_DEFAULT; + if (name == "SPACE") return CAT_SPACE; + if (name == "KANJI") return CAT_KANJI; + if (name == "SYMBOL") return CAT_SYMBOL; + if (name == "NUMERIC") return CAT_NUMERIC; + if (name == "ALPHA") return CAT_ALPHA; + if (name == "HIRAGANA") return CAT_HIRAGANA; + if (name == "KATAKANA") return CAT_KATAKANA; + if (name == "KANJINUMERIC") return CAT_KANJINUMERIC; + if (name == "GREEK") return CAT_GREEK; + if (name == "CYRILLIC") return CAT_CYRILLIC; + return CAT_CLASS_COUNT; // unknown +} + +Status parse_lexicon_line(std::string_view line, std::string* surface, BuilderWord* out) { + const auto c1 = line.find(','); + const auto c2 = c1 == std::string_view::npos ? c1 : line.find(',', c1 + 1); + const auto c3 = c2 == std::string_view::npos ? c2 : line.find(',', c2 + 1); + const auto c4 = c3 == std::string_view::npos ? 
c3 : line.find(',', c3 + 1); + if (c4 == std::string_view::npos) { + return Status::InvalidArgument("kuromoji ipadic: malformed lexicon line: {}", + std::string(line)); + } + int left = 0; + int right = 0; + int cost = 0; + if (!parse_dec(line.substr(c1 + 1, c2 - c1 - 1), &left) || + !parse_dec(line.substr(c2 + 1, c3 - c2 - 1), &right) || + !parse_dec(line.substr(c3 + 1, c4 - c3 - 1), &cost)) { + return Status::InvalidArgument("kuromoji ipadic: bad ids/cost in lexicon line: {}", + std::string(line)); + } + *surface = std::string(line.substr(0, c1)); + out->left_id = to_int16(left); + out->right_id = to_int16(right); + out->word_cost = to_int16(cost); + out->feature = std::string(line.substr(c4 + 1)); + return Status::OK(); +} + +Status parse_matrix_def(std::string_view content, MatrixInput* out) { + bool have_header = false; + Status st = Status::OK(); + for_each_line(content, [&](std::string_view raw) { + if (!st.ok()) { + return; + } + const std::string_view line = strip_comment(raw); + if (line.empty()) { + return; + } + const auto tok = ws_tokens(line); + if (!have_header) { + int fwd = 0; + int bwd = 0; + if (tok.size() < 2 || !parse_dec(tok[0], &fwd) || !parse_dec(tok[1], &bwd) || + fwd <= 0 || bwd <= 0) { + st = Status::InvalidArgument("kuromoji ipadic: bad matrix.def header"); + return; + } + out->forward_size = static_cast(fwd); + out->backward_size = static_cast(bwd); + out->cells.assign(static_cast(fwd) * static_cast(bwd), 0); + have_header = true; + return; + } + int a = 0; + int b = 0; + int c = 0; + if (tok.size() < 3 || !parse_dec(tok[0], &a) || !parse_dec(tok[1], &b) || + !parse_dec(tok[2], &c)) { + st = Status::InvalidArgument("kuromoji ipadic: bad matrix.def row"); + return; + } + if (a < 0 || b < 0 || static_cast(a) >= out->forward_size || + static_cast(b) >= out->backward_size) { + st = Status::InvalidArgument("kuromoji ipadic: matrix.def id out of range"); + return; + } + out->cells[static_cast(b) * out->forward_size + static_cast(a)] = + 
to_int16(c); + }); + if (st.ok() && !have_header) { + return Status::InvalidArgument("kuromoji ipadic: empty matrix.def"); + } + return st; +} + +Status parse_char_def(std::string_view content, CharDefInput* out) { + out->catmap.fill(static_cast(CAT_DEFAULT)); + out->defs.assign(CAT_CLASS_COUNT, CategoryDef {0, 0, 0}); + Status st = Status::OK(); + for_each_line(content, [&](std::string_view raw) { + if (!st.ok()) { + return; + } + const std::string_view line = strip_comment(raw); + if (line.empty()) { + return; + } + const auto tok = ws_tokens(line); + if (tok.empty()) { + return; + } + if (tok[0].size() >= 2 && tok[0][0] == '0' && (tok[0][1] == 'x' || tok[0][1] == 'X')) { + // code-point mapping: 0xXXXX[..0xYYYY] CATEGORY [extra...] + if (tok.size() < 2) { + return; + } + uint32_t lo = 0; + uint32_t hi = 0; + const auto dots = tok[0].find(".."); + if (dots != std::string_view::npos) { + if (!parse_hex_cp(tok[0].substr(0, dots), &lo) || + !parse_hex_cp(tok[0].substr(dots + 2), &hi)) { + return; + } + } else { + if (!parse_hex_cp(tok[0], &lo)) { + return; + } + hi = lo; + } + const uint8_t ord = ipadic_category_ordinal(tok[1]); // first = primary + if (ord >= CAT_CLASS_COUNT) { + return; + } + if (hi > 0xFFFF) { + hi = 0xFFFF; // catmap covers the BMP only + } + for (uint32_t cp = lo; cp <= hi && cp <= 0xFFFF; ++cp) { + out->catmap[cp] = ord; + } + } else { + // category definition: NAME INVOKE GROUP LENGTH + const uint8_t ord = ipadic_category_ordinal(tok[0]); + if (ord >= CAT_CLASS_COUNT || tok.size() < 4) { + return; + } + int invoke = 0; + int group = 0; + int length = 0; + if (!parse_dec(tok[1], &invoke) || !parse_dec(tok[2], &group) || + !parse_dec(tok[3], &length)) { + return; + } + out->defs[ord] = CategoryDef {static_cast(invoke != 0), + static_cast(group != 0), + static_cast(length < 0 ? 
0 : length)}; + } + }); + return st; +} + +Status parse_unk_def(std::string_view content, UnkDictInput* out) { + out->per_category.assign(CAT_CLASS_COUNT, {}); + Status st = Status::OK(); + for_each_line(content, [&](std::string_view raw) { + if (!st.ok()) { + return; + } + const std::string_view line = strip_comment(raw); + if (line.empty()) { + return; + } + std::string surface; + BuilderWord w; + if (!parse_lexicon_line(line, &surface, &w).ok()) { + return; // skip malformed unk row + } + const uint8_t ord = ipadic_category_ordinal(surface); // col 0 is the category name + if (ord >= CAT_CLASS_COUNT) { + return; + } + out->per_category[ord].push_back(std::move(w)); + }); + return st; +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h new file mode 100644 index 00000000000000..601ea2e3f43eae --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" + +// Parsers for the mecab-ipadic source files (assumed already transcoded to UTF-8). +// Pure text->struct transforms feeding KuromojiDictionaryBuilder. Build-time only. +namespace doris::segment_v2::kuromoji { + +// Maps an IPADIC char.def category name (e.g. "KANJI") to our canonical +// CharCategory ordinal. Returns CAT_CLASS_COUNT for an unknown name. +uint8_t ipadic_category_ordinal(std::string_view name); + +// Parse one IPADIC lexicon CSV row: surface,left,right,cost,. +// `feature` keeps columns 5.. verbatim (already comma-separated). +Status parse_lexicon_line(std::string_view line, std::string* surface, BuilderWord* out); + +// Parse the whole matrix.def: header " " then +// " " lines. +Status parse_matrix_def(std::string_view content, MatrixInput* out); + +// Parse the whole char.def: category definitions (NAME INVOKE GROUP LENGTH) and +// code-point mappings (0xXXXX[..0xYYYY] CATEGORY [extra...]). +Status parse_char_def(std::string_view content, CharDefInput* out); + +// Parse the whole unk.def: CATEGORY,left,right,cost, rows. +Status parse_unk_def(std::string_view content, UnkDictInput* out); + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize.h b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize.h new file mode 100644 index 00000000000000..9db173891eabaa --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace doris::segment_v2::kuromoji { + +// CJK width folding (a subset of Lucene's CJKWidthFilter): full-width ASCII +// variants U+FF01..U+FF5E -> basic-latin U+0021..U+007E, and the ideographic +// space U+3000 -> ' '. So ABC123 -> ABC123. Everything else is preserved +// byte-for-byte. (Half-width katakana -> full-width composition is a TODO.) +inline std::string cjk_width_normalize(std::string_view in) { + std::string out; + out.reserve(in.size()); + std::size_t i = 0; + const std::size_t n = in.size(); + while (i < n) { + const auto b0 = static_cast(in[i]); + std::size_t len = 1; + char32_t cp = b0; + if (b0 >= 0xF0 && i + 3 < n) { + cp = static_cast(((b0 & 0x07U) << 18) | + ((static_cast(in[i + 1]) & 0x3FU) << 12) | + ((static_cast(in[i + 2]) & 0x3FU) << 6) | + (static_cast(in[i + 3]) & 0x3FU)); + len = 4; + } else if (b0 >= 0xE0 && i + 2 < n) { + cp = static_cast(((b0 & 0x0FU) << 12) | + ((static_cast(in[i + 1]) & 0x3FU) << 6) | + (static_cast(in[i + 2]) & 0x3FU)); + len = 3; + } else if (b0 >= 0xC0 && i + 1 < n) { + cp = static_cast(((b0 & 0x1FU) << 6) | + (static_cast(in[i + 1]) & 0x3FU)); + len = 2; + } + + if (cp >= 0xFF01 && cp <= 0xFF5E) { + out.push_back(static_cast(cp - 0xFEE0)); // full-width ASCII -> ASCII + } else if (cp == 0x3000) { + out.push_back(' '); // ideographic space -> space + } else { + 
out.append(in.substr(i, len)); // keep original bytes + } + i += len; + } + return out; +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.cpp b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.cpp new file mode 100644 index 00000000000000..e8c4ba7c579e3a --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.cpp @@ -0,0 +1,263 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h" + +#include +#include +#include + +namespace doris::segment_v2::kuromoji { + +namespace { + +constexpr int64_t KMJ_INF = std::numeric_limits::max() / 4; +constexpr uint32_t MAX_UNKNOWN_GROUP_CHARS = 1024; + +// Search/Extended-mode compound-decomposition penalties, matching Lucene's +// JapaneseTokenizer. Lengths are counted in code points. A token longer than the +// length threshold is penalized so the minimum-cost path prefers its shorter +// parts: all-kanji runs over KANJI_LENGTH chars, other runs over OTHER_LENGTH. 
+constexpr uint32_t SEARCH_MODE_KANJI_LENGTH = 2; +constexpr int64_t SEARCH_MODE_KANJI_PENALTY = 3000; +constexpr uint32_t SEARCH_MODE_OTHER_LENGTH = 7; +constexpr int64_t SEARCH_MODE_OTHER_PENALTY = 1700; + +struct DecodedCp { + char32_t cp; + uint32_t len; +}; + +// Decode one UTF-8 code point at text[pos]. Invalid/truncated -> single byte. +DecodedCp decode_utf8(std::string_view text, std::size_t pos) { + auto b0 = static_cast(text[pos]); + const std::size_t avail = text.size() - pos; + if (b0 < 0x80) { + return {b0, 1}; + } + if ((b0 >> 5) == 0x6 && avail >= 2) { + auto b1 = static_cast(text[pos + 1]); + return {static_cast(((b0 & 0x1FU) << 6) | (b1 & 0x3FU)), 2}; + } + if ((b0 >> 4) == 0xE && avail >= 3) { + auto b1 = static_cast(text[pos + 1]); + auto b2 = static_cast(text[pos + 2]); + return {static_cast(((b0 & 0x0FU) << 12) | ((b1 & 0x3FU) << 6) | (b2 & 0x3FU)), + 3}; + } + if ((b0 >> 3) == 0x1E && avail >= 4) { + auto b1 = static_cast(text[pos + 1]); + auto b2 = static_cast(text[pos + 2]); + auto b3 = static_cast(text[pos + 3]); + return {static_cast(((b0 & 0x07U) << 18) | ((b1 & 0x3FU) << 12) | + ((b2 & 0x3FU) << 6) | (b3 & 0x3FU)), + 4}; + } + return {b0, 1}; +} + +// Lucene JapaneseTokenizer's search-mode penalty for the token covering +// [start, end) bytes: penalize long compounds so the Viterbi prefers their +// shorter parts. Returns 0 for tokens at or under the length thresholds. 
+int64_t compute_penalty(const KuromojiDictionary& dict, std::string_view text, uint32_t start, + uint32_t end) { + uint32_t length = 0; + bool all_kanji = true; + for (uint32_t p = start; p < end;) { + const DecodedCp d = decode_utf8(text, p); + if (dict.char_category(d.cp) != CAT_KANJI) { + all_kanji = false; + } + p += d.len; + ++length; + } + if (length > SEARCH_MODE_KANJI_LENGTH) { + if (all_kanji) { + return static_cast(length - SEARCH_MODE_KANJI_LENGTH) * + SEARCH_MODE_KANJI_PENALTY; + } + if (length > SEARCH_MODE_OTHER_LENGTH) { + return static_cast(length - SEARCH_MODE_OTHER_LENGTH) * + SEARCH_MODE_OTHER_PENALTY; + } + } + return 0; +} + +// A lattice node spanning [start, end) bytes of the input. +struct VNode { + uint32_t start; + uint32_t end; + int16_t left_id; + int16_t right_id; + int16_t word_cost; + bool known; + uint32_t word_id; + int64_t total_cost; + int back; // previous node index, -1 if none +}; + +} // namespace + +void KuromojiViterbi::segment(std::string_view text, std::vector* out) const { + out->clear(); + const auto n = static_cast(text.size()); + if (n == 0) { + return; + } + + std::vector nodes; + std::vector> ending_at(n + 1); // node indices ending at each byte position + + // BOS (index 0): ends at position 0, context id 0, zero cost. + nodes.push_back(VNode {0, 0, 0, 0, 0, false, 0, 0, -1}); + ending_at[0].push_back(0); + + // Add a node and relax it against all nodes ending at its start position. 
+ auto add_node = [&](uint32_t s, uint32_t e, int16_t lid, int16_t rid, int16_t wcost, bool known, + uint32_t wid) { + int64_t best = KMJ_INF; + int best_prev = -1; + for (int pe : ending_at[s]) { + const VNode& pv = nodes[static_cast(pe)]; + if (pv.total_cost >= KMJ_INF) { + continue; + } + const int64_t c = + pv.total_cost + _dict.connection_cost(static_cast(pv.right_id), + static_cast(lid)); + if (c < best) { + best = c; + best_prev = pe; + } + } + if (best_prev < 0) { + return; + } + // Search/Extended mode penalizes long compounds so shorter parts win. + const int64_t penalty = + _mode == KuromojiMode::Normal ? 0 : compute_penalty(_dict, text, s, e); + const auto idx = static_cast(nodes.size()); + nodes.push_back( + VNode {s, e, lid, rid, wcost, known, wid, best + wcost + penalty, best_prev}); + ending_at[e].push_back(idx); + }; + + uint32_t pos = 0; + while (pos < n) { + if (ending_at[pos].empty()) { + pos += decode_utf8(text, pos).len; // unreachable boundary; skip + continue; + } + const DecodedCp d0 = decode_utf8(text, pos); + const auto before = nodes.size(); + + // System-dictionary words (common-prefix search). + std::vector matches; + _dict.common_prefix_search(text.data() + pos, n - pos, &matches); + bool any_known = false; + for (const auto& mt : matches) { + const WordIdRun run = _dict.run_for_value(mt.trie_value); + for (uint32_t k = 0; k < run.count; ++k) { + const uint32_t wid = run.entry_start + k; + const WordEntry& e = _dict.word(wid); + add_node(pos, pos + mt.length, e.left_id, e.right_id, e.word_cost, true, wid); + any_known = true; + } + } + + // Unknown words: when no known word starts here, or the category forces it. 
+ if (!any_known || _dict.is_invoke(d0.cp)) { + const uint8_t cat = _dict.char_category(d0.cp); + uint32_t group_len = d0.len; + if (_dict.is_group(d0.cp)) { + uint32_t p = pos + d0.len; + uint32_t chars = 1; + while (p < n && chars < MAX_UNKNOWN_GROUP_CHARS) { + const DecodedCp dn = decode_utf8(text, p); + if (_dict.char_category(dn.cp) != cat) { + break; + } + group_len += dn.len; + p += dn.len; + ++chars; + } + } + const WordIdRun urun = _dict.unknown_run(cat); + for (uint32_t k = 0; k < urun.count; ++k) { + const uint32_t wid = urun.entry_start + k; + const WordEntry& e = _dict.unknown_word(wid); + add_node(pos, pos + d0.len, e.left_id, e.right_id, e.word_cost, false, wid); + if (group_len > d0.len) { + add_node(pos, pos + group_len, e.left_id, e.right_id, e.word_cost, false, wid); + } + } + } + + // Connectivity safety net: if nothing covers this reachable position, force a + // single-character node so the lattice never dead-ends. + if (nodes.size() == before) { + add_node(pos, pos + d0.len, 0, 0, std::numeric_limits::max(), false, 0); + } + pos += d0.len; + } + + // EOS: best node ending at n connected to the EOS context (id 0). + int64_t best = KMJ_INF; + int best_prev = -1; + for (int pe : ending_at[n]) { + const VNode& pv = nodes[static_cast(pe)]; + if (pv.total_cost >= KMJ_INF) { + continue; + } + const int64_t c = + pv.total_cost + _dict.connection_cost(static_cast(pv.right_id), 0); + if (c < best) { + best = c; + best_prev = pe; + } + } + if (best_prev < 0) { + return; // no path (should not happen given the connectivity net) + } + + std::vector path; + for (int cur = best_prev; cur > 0; cur = nodes[static_cast(cur)].back) { + path.push_back(cur); + } + std::reverse(path.begin(), path.end()); + out->reserve(path.size()); + for (int idx : path) { + const VNode& nd = nodes[static_cast(idx)]; + // Extended mode decomposes each unknown (out-of-vocabulary) word into its + // per-code-point unigrams, mirroring Lucene JapaneseTokenizer's EXTENDED + // mode. 
Every code point in an unknown node shares its character category + // (group nodes are built from a same-category run), so the unigrams reuse + // the node's unknown word_id. Known words are left intact. + if (_mode == KuromojiMode::Extended && !nd.known) { + for (uint32_t p = nd.start; p < nd.end;) { + const uint32_t len = decode_utf8(text, p).len; + out->push_back(KuromojiMorpheme {p, len, false, nd.word_id}); + p += len; + } + } else { + out->push_back(KuromojiMorpheme {nd.start, nd.end - nd.start, nd.known, nd.word_id}); + } + } +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h new file mode 100644 index 00000000000000..4fca28e5bd5259 --- /dev/null +++ b/be/src/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "storage/index/inverted/analyzer/kuromoji/KuromojiMode.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" + +namespace doris::segment_v2::kuromoji { + +// One morpheme on the best path. 
`surface` is identified by byte range into the +// analyzed input; `known` distinguishes a system-dictionary word from a +// synthesized unknown word, and `word_id` indexes the system or unknown entries +// (so callers can fetch features / base forms in later phases). +struct KuromojiMorpheme { + uint32_t byte_start = 0; + uint32_t byte_len = 0; + bool known = false; + uint32_t word_id = 0; +}; + +// Viterbi morphological segmenter. It builds a lattice over the input (known +// words from the system dictionary via common-prefix search, unknown words from +// the character-category rules) and returns the minimum-cost path, where cost is +// the sum of word costs and connection costs. +// +// In Search/Extended mode it additionally applies Lucene JapaneseTokenizer's +// compound-decomposition penalty: long tokens (all-kanji runs longer than 2 +// chars, or other runs longer than 7 chars) are penalized so the lattice prefers +// segmenting a long compound into its shorter parts, improving search recall. +// Extended mode additionally splits unknown (out-of-vocabulary) words into +// per-code-point unigrams. Normal mode applies no penalty. 
+class KuromojiViterbi { +public: + explicit KuromojiViterbi(const KuromojiDictionary& dict, + KuromojiMode mode = KuromojiMode::Normal) + : _dict(dict), _mode(mode) {} + + void segment(std::string_view text, std::vector* out) const; + +private: + const KuromojiDictionary& _dict; + KuromojiMode _mode; +}; + +} // namespace doris::segment_v2::kuromoji diff --git a/be/src/storage/index/inverted/inverted_index_parser.cpp b/be/src/storage/index/inverted/inverted_index_parser.cpp index 47819cc62f6397..769ecabb19f373 100644 --- a/be/src/storage/index/inverted/inverted_index_parser.cpp +++ b/be/src/storage/index/inverted/inverted_index_parser.cpp @@ -39,6 +39,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_ return INVERTED_INDEX_PARSER_BASIC; case InvertedIndexParserType::PARSER_IK: return INVERTED_INDEX_PARSER_IK; + case InvertedIndexParserType::PARSER_KUROMOJI: + return INVERTED_INDEX_PARSER_KUROMOJI; default: return INVERTED_INDEX_PARSER_UNKNOWN; } @@ -62,6 +64,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st return InvertedIndexParserType::PARSER_BASIC; } else if (parser_str_lower == INVERTED_INDEX_PARSER_IK) { return InvertedIndexParserType::PARSER_IK; + } else if (parser_str_lower == INVERTED_INDEX_PARSER_KUROMOJI) { + return InvertedIndexParserType::PARSER_KUROMOJI; } return InvertedIndexParserType::PARSER_UNKNOWN; @@ -89,8 +93,13 @@ std::string get_parser_mode_string_from_properties( if (parser_it == properties.end()) { parser_it = properties.find(INVERTED_INDEX_PARSER_KEY_ALIAS); } - if (parser_it != properties.end() && parser_it->second == INVERTED_INDEX_PARSER_IK) { - return INVERTED_INDEX_PARSER_SMART; + if (parser_it != properties.end()) { + if (parser_it->second == INVERTED_INDEX_PARSER_IK) { + return INVERTED_INDEX_PARSER_SMART; + } + if (parser_it->second == INVERTED_INDEX_PARSER_KUROMOJI) { + return INVERTED_INDEX_PARSER_KUROMOJI_SEARCH; + } } return 
INVERTED_INDEX_PARSER_COARSE_GRANULARITY; } diff --git a/be/src/storage/index/inverted/inverted_index_parser.h b/be/src/storage/index/inverted/inverted_index_parser.h index d2d3df47abd0a3..411a12eca199f0 100644 --- a/be/src/storage/index/inverted/inverted_index_parser.h +++ b/be/src/storage/index/inverted/inverted_index_parser.h @@ -41,7 +41,8 @@ enum class InvertedIndexParserType { PARSER_UNICODE = 5, PARSER_ICU = 6, PARSER_BASIC = 7, - PARSER_IK = 8 + PARSER_IK = 8, + PARSER_KUROMOJI = 9 }; using CharFilterMap = std::map; @@ -77,6 +78,10 @@ const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese"; const std::string INVERTED_INDEX_PARSER_ICU = "icu"; const std::string INVERTED_INDEX_PARSER_BASIC = "basic"; const std::string INVERTED_INDEX_PARSER_IK = "ik"; +const std::string INVERTED_INDEX_PARSER_KUROMOJI = "kuromoji"; +const std::string INVERTED_INDEX_PARSER_KUROMOJI_NORMAL = "normal"; +const std::string INVERTED_INDEX_PARSER_KUROMOJI_SEARCH = "search"; +const std::string INVERTED_INDEX_PARSER_KUROMOJI_EXTENDED = "extended"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true"; diff --git a/be/src/tools/CMakeLists.txt b/be/src/tools/CMakeLists.txt index 3307e268af6103..2f7671a7ee825e 100644 --- a/be/src/tools/CMakeLists.txt +++ b/be/src/tools/CMakeLists.txt @@ -116,3 +116,44 @@ if (BUILD_INDEX_TOOL) ) endif() endif() + +# Offline generator for the Japanese kuromoji dictionary. Compiles the UTF-8 +# mecab-ipadic source into the four dict/kuromoji/*.bin files that the BE install +# rule ships. EXCLUDE_FROM_ALL and not installed: it is a maintenance tool, not +# part of doris_be. Regenerate with `ninja kuromoji_dict`. +add_executable(kuromoji_build_dict EXCLUDE_FROM_ALL + kuromoji_build_dict.cpp +) + +target_include_directories(kuromoji_build_dict PRIVATE ${PROJECT_SOURCE_DIR}/..) 
+ +pch_reuse(kuromoji_build_dict) + +set_target_properties(kuromoji_build_dict PROPERTIES ENABLE_EXPORTS 1) + +if (COMPILER_CLANG) + target_compile_options(kuromoji_build_dict PRIVATE + -Wno-implicit-int-conversion + -Wno-shorten-64-to-32 + ) +endif() + +target_link_libraries(kuromoji_build_dict + ${DORIS_LINK_LIBS} +) + +# `ninja kuromoji_dict` runs the tool over the staged mecab-ipadic source and +# (re)writes dict/kuromoji/*.bin. Point KUROMOJI_IPADIC_SRC elsewhere if the +# source is not under the thirdparty share directory. +set(KUROMOJI_IPADIC_SRC "${THIRDPARTY_DIR}/share/mecab-ipadic-2.7.0-20250920" + CACHE PATH "UTF-8 mecab-ipadic source directory used to generate the kuromoji dictionary") +set(KUROMOJI_DICT_OUT "${BASE_DIR}/dict/kuromoji") +add_custom_command( + OUTPUT "${KUROMOJI_DICT_OUT}/system.bin" "${KUROMOJI_DICT_OUT}/matrix.bin" + "${KUROMOJI_DICT_OUT}/chardef.bin" "${KUROMOJI_DICT_OUT}/unkdict.bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${KUROMOJI_DICT_OUT}" + COMMAND $ "${KUROMOJI_IPADIC_SRC}" "${KUROMOJI_DICT_OUT}" + DEPENDS kuromoji_build_dict + COMMENT "Generating kuromoji IPADIC dictionary from ${KUROMOJI_IPADIC_SRC}" + VERBATIM) +add_custom_target(kuromoji_dict DEPENDS "${KUROMOJI_DICT_OUT}/system.bin") diff --git a/be/src/tools/kuromoji_build_dict.cpp b/be/src/tools/kuromoji_build_dict.cpp new file mode 100644 index 00000000000000..a43ae7dee4f88a --- /dev/null +++ b/be/src/tools/kuromoji_build_dict.cpp @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Offline tool: compile a UTF-8 mecab-ipadic source directory into the four +// kuromoji .bin files consumed by KuromojiDictionary. +// usage: kuromoji_build_dict +// Built on demand via `ninja kuromoji_dict`; never linked into doris_be. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h" + +namespace fs = std::filesystem; +using namespace doris::segment_v2::kuromoji; +using doris::Status; + +namespace { + +bool read_file(const std::string& path, std::string* out) { + std::ifstream in(path, std::ios::binary); + if (!in) { + std::fprintf(stderr, "cannot open %s\n", path.c_str()); + return false; + } + std::ostringstream ss; + ss << in.rdbuf(); + *out = ss.str(); + return true; +} + +void for_each_line(const std::string& content, const std::function& fn) { + std::size_t i = 0; + while (i < content.size()) { + auto nl = content.find('\n', i); + if (nl == std::string::npos) { + nl = content.size(); + } + std::string_view line(content.data() + i, nl - i); + if (!line.empty() && line.back() == '\r') { + line.remove_suffix(1); + } + if (!line.empty()) { + fn(line); + } + i = nl + 1; + } +} + +} // namespace + +int main(int argc, char** argv) { + if (argc < 3) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + const std::string src = argv[1]; + const std::string out = argv[2]; + 
std::error_code ec; + fs::create_directories(out, ec); + + // --- system dictionary: group all *.csv lexicon rows by surface (homographs) --- + std::unordered_map> by_surface; + std::size_t lexicon_rows = 0; + for (const auto& entry : fs::directory_iterator(src)) { + if (!entry.is_regular_file() || entry.path().extension() != ".csv") { + continue; + } + std::string content; + if (!read_file(entry.path().string(), &content)) { + return 1; + } + for_each_line(content, [&](std::string_view line) { + std::string surface; + BuilderWord w; + if (parse_lexicon_line(line, &surface, &w).ok()) { + by_surface[surface].push_back(std::move(w)); + ++lexicon_rows; + } + }); + } + SystemDictInput sys; + sys.surfaces.reserve(by_surface.size()); + for (auto& kv : by_surface) { + sys.surfaces.emplace_back(kv.first, std::move(kv.second)); + } + if (Status st = KuromojiDictionaryBuilder::write_system(out + "/system.bin", sys); !st.ok()) { + std::fprintf(stderr, "write_system failed: %s\n", st.to_string().c_str()); + return 1; + } + + // --- connection cost matrix --- + std::string matrix_txt; + MatrixInput matrix; + if (!read_file(src + "/matrix.def", &matrix_txt) || + !parse_matrix_def(matrix_txt, &matrix).ok() || + !KuromojiDictionaryBuilder::write_matrix(out + "/matrix.bin", matrix).ok()) { + std::fprintf(stderr, "matrix.def build failed\n"); + return 1; + } + + // --- character definitions --- + std::string char_txt; + CharDefInput chardef; + if (!read_file(src + "/char.def", &char_txt) || !parse_char_def(char_txt, &chardef).ok() || + !KuromojiDictionaryBuilder::write_chardef(out + "/chardef.bin", chardef).ok()) { + std::fprintf(stderr, "char.def build failed\n"); + return 1; + } + + // --- unknown-word dictionary --- + std::string unk_txt; + UnkDictInput unk; + if (!read_file(src + "/unk.def", &unk_txt) || !parse_unk_def(unk_txt, &unk).ok() || + !KuromojiDictionaryBuilder::write_unkdict(out + "/unkdict.bin", unk).ok()) { + std::fprintf(stderr, "unk.def build failed\n"); + return 
1; + } + + std::fprintf(stderr, + "kuromoji dict built: %zu surfaces (%zu lexicon rows), matrix %ux%u -> %s\n", + sys.surfaces.size(), lexicon_rows, matrix.forward_size, matrix.backward_size, + out.c_str()); + return 0; +} diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/darts_smoke_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/darts_smoke_test.cpp new file mode 100644 index 00000000000000..2411aa80104b8a --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/darts_smoke_test.cpp @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include + +#include "storage/index/inverted/analyzer/kuromoji/dict/darts.h" + +namespace doris::segment_v2::kuromoji { + +// Proves the vendored Darts-clone header builds + queries: surface forms -> +// non-negative values, then common-prefix search (the API the kuromoji lattice +// will use). "東"(E6 9D B1), "京"(E4 BA AC), "東京"(E6 9D B1 E4 BA AC). 
+TEST(DartsSmokeTest, BuildAndCommonPrefixSearch) { + std::vector keys = {"\xE4\xBA\xAC", "\xE6\x9D\xB1", "\xE6\x9D\xB1\xE4\xBA\xAC"}; + std::sort(keys.begin(), keys.end()); // Darts requires byte-sorted keys + + std::vector kptrs; + std::vector klens; + std::vector values; + for (std::size_t i = 0; i < keys.size(); ++i) { + kptrs.push_back(keys[i].data()); + klens.push_back(keys[i].size()); + values.push_back(static_cast(i)); // non-negative (MSB reserved by Darts) + } + + Darts::DoubleArray da; + ASSERT_EQ(0, da.build(keys.size(), kptrs.data(), klens.data(), values.data())); + + // exact match for "東京" + const std::string tokyo = "\xE6\x9D\xB1\xE4\xBA\xAC"; + EXPECT_GE(da.exactMatchSearch(tokyo.data(), tokyo.size()), 0); + + // common-prefix search over "東京" must yield "東" (len 3) and "東京" (len 6) + Darts::DoubleArray::result_pair_type results[8]; + std::size_t n = da.commonPrefixSearch(tokyo.data(), results, 8, tokyo.size()); + ASSERT_GE(n, 2U); + bool saw_len3 = false; + bool saw_len6 = false; + for (std::size_t i = 0; i < n && i < 8; ++i) { + if (results[i].length == 3) { + saw_len3 = true; + } + if (results[i].length == 6) { + saw_len6 = true; + } + } + EXPECT_TRUE(saw_len3); + EXPECT_TRUE(saw_len6); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dict_format_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dict_format_test.cpp new file mode 100644 index 00000000000000..f4aed864ca1e85 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dict_format_test.cpp @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" + +#include + +#include + +namespace doris::segment_v2::kuromoji { + +static_assert(sizeof(KmjFileHeader) == 32, "KmjFileHeader must be 32 bytes"); +static_assert(sizeof(WordEntry) == 12, "WordEntry must be 12 bytes"); +static_assert(sizeof(WordIdRun) == 8, "WordIdRun must be 8 bytes"); +static_assert(sizeof(CategoryDef) == 4, "CategoryDef must be 4 bytes"); + +TEST(KuromojiDictFormatTest, ConstantsAndCategories) { + EXPECT_EQ(KMJ_FORMAT_VERSION, 1U); + EXPECT_EQ(CAT_CLASS_COUNT, 12); + EXPECT_EQ(KMJ_NO_FEATURE, 0xFFFFFFFFU); +} + +TEST(KuromojiDictFormatTest, ConnectionCostIndexing) { + // forward_size=2, backward_size=3 -> 6 cells, row-major by backward_id. 
+ int16_t cells[6] = {10, 11, 20, 21, 30, 31}; + // cell index = backward_id * forward_size + forward_id + EXPECT_EQ(connection_cost(cells, 2, /*forward*/ 0, /*backward*/ 0), 10); + EXPECT_EQ(connection_cost(cells, 2, /*forward*/ 1, /*backward*/ 0), 11); + EXPECT_EQ(connection_cost(cells, 2, /*forward*/ 0, /*backward*/ 1), 20); + EXPECT_EQ(connection_cost(cells, 2, /*forward*/ 0, /*backward*/ 2), 30); + EXPECT_EQ(connection_cost(cells, 2, /*forward*/ 1, /*backward*/ 2), 31); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_builder_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_builder_test.cpp new file mode 100644 index 00000000000000..94a49c0c8e74ae --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_builder_test.cpp @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" + +#include +#include + +#include + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" + +namespace doris::segment_v2::kuromoji { + +static bool file_nonempty(const std::string& p) { + struct stat st {}; + return ::stat(p.c_str(), &st) == 0 && st.st_size > static_cast(sizeof(KmjFileHeader)); +} + +TEST(KuromojiDictionaryBuilderTest, WritesFourFiles) { + std::string dir = std::string(::testing::TempDir()) + "/kmj_build_test"; + ::mkdir(dir.c_str(), 0755); + + SystemDictInput sys; + sys.surfaces.push_back({"\xE6\x9D\xB1", {{1, 1, 100, "POS,East"}}}); // 東 + sys.surfaces.push_back({"\xE6\x9D\xB1\xE4\xBA\xAC", {{2, 2, 50, "POS,Tokyo"}}}); // 東京 + ASSERT_TRUE(KuromojiDictionaryBuilder::write_system(dir + "/system.bin", sys).ok()); + + MatrixInput m; + m.forward_size = 3; + m.backward_size = 3; + m.cells.assign(9, 7); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_matrix(dir + "/matrix.bin", m).ok()); + + CharDefInput cd; + cd.catmap.fill(CAT_DEFAULT); + cd.catmap[0x6771] = CAT_KANJI; // 東 + cd.defs.assign(CAT_CLASS_COUNT, CategoryDef {0, 0, 0}); + cd.defs[CAT_DEFAULT] = CategoryDef {1, 1, 0}; + ASSERT_TRUE(KuromojiDictionaryBuilder::write_chardef(dir + "/chardef.bin", cd).ok()); + + UnkDictInput unk; + unk.per_category.resize(CAT_CLASS_COUNT); + unk.per_category[CAT_DEFAULT].push_back({5, 5, 4769, "SYMBOL"}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_unkdict(dir + "/unkdict.bin", unk).ok()); + + EXPECT_TRUE(file_nonempty(dir + "/system.bin")); + EXPECT_TRUE(file_nonempty(dir + "/matrix.bin")); + EXPECT_TRUE(file_nonempty(dir + "/chardef.bin")); + EXPECT_TRUE(file_nonempty(dir + "/unkdict.bin")); + + // matrix rejects a wrong cell count. 
+ MatrixInput bad; + bad.forward_size = 2; + bad.backward_size = 2; + bad.cells.assign(3, 0); + EXPECT_FALSE(KuromojiDictionaryBuilder::write_matrix(dir + "/bad.bin", bad).ok()); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_test.cpp new file mode 100644 index 00000000000000..dee4f17cf5f584 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_dictionary_test.cpp @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" + +#include +#include + +#include +#include +#include + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" + +namespace doris::segment_v2::kuromoji { + +class KuromojiDictionaryTest : public ::testing::Test { +protected: + std::string _dir; + + void SetUp() override { + _dir = std::string(::testing::TempDir()) + "/kmj_dict_rt"; + ::mkdir(_dir.c_str(), 0755); + + SystemDictInput sys; + sys.surfaces.push_back({"\xE6\x9D\xB1", {{1, 1, 100, "f-east"}}}); // 東 + sys.surfaces.push_back({"\xE6\x9D\xB1\xE4\xBA\xAC", {{2, 2, 50, "f-tokyo"}}}); // 東京 + ASSERT_TRUE(KuromojiDictionaryBuilder::write_system(_dir + "/system.bin", sys).ok()); + + MatrixInput m; + m.forward_size = 3; + m.backward_size = 3; + m.cells = {0, 0, 0, 0, 0, 0, 0, 0, 42}; // cells[backward=2 * 3 + forward=2] = 42 + ASSERT_TRUE(KuromojiDictionaryBuilder::write_matrix(_dir + "/matrix.bin", m).ok()); + + CharDefInput cd; + cd.catmap.fill(CAT_DEFAULT); + cd.catmap[0x6771] = CAT_KANJI; // 東 + cd.defs.assign(CAT_CLASS_COUNT, CategoryDef {0, 0, 0}); + cd.defs[CAT_DEFAULT] = CategoryDef {1, 1, 0}; // invoke=1, group=1 + cd.defs[CAT_KANJI] = CategoryDef {0, 0, 2}; // invoke=0, group=0 + ASSERT_TRUE(KuromojiDictionaryBuilder::write_chardef(_dir + "/chardef.bin", cd).ok()); + + UnkDictInput unk; + unk.per_category.resize(CAT_CLASS_COUNT); + unk.per_category[CAT_DEFAULT].push_back({5, 5, 4769, "unk-default"}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_unkdict(_dir + "/unkdict.bin", unk).ok()); + } +}; + +TEST_F(KuromojiDictionaryTest, LoadAndQuery) { + std::unique_ptr dict; + ASSERT_TRUE(KuromojiDictionary::load(_dir, &dict).ok()); + + // common-prefix search over "東京" finds "東" (len 3) and "東京" (len 6). 
+ const std::string text = "\xE6\x9D\xB1\xE4\xBA\xAC"; + std::vector matches; + dict->common_prefix_search(text.data(), text.size(), &matches); + ASSERT_EQ(matches.size(), 2U); + + // Resolve "東京" (length 6) -> its single entry: cost 50, leftId 2, feature "f-tokyo". + bool checked = false; + for (const auto& mt : matches) { + if (mt.length == 6) { + WordIdRun run = dict->run_for_value(mt.trie_value); + ASSERT_EQ(run.count, 1U); + const WordEntry& e = dict->word(run.entry_start); + EXPECT_EQ(e.left_id, 2); + EXPECT_EQ(e.word_cost, 50); + EXPECT_EQ(dict->feature(e), std::string_view("f-tokyo")); + checked = true; + } + } + EXPECT_TRUE(checked); + + EXPECT_EQ(dict->connection_cost(/*forward*/ 2, /*backward*/ 2), 42); + + EXPECT_EQ(dict->char_category(0x6771), CAT_KANJI); + EXPECT_EQ(dict->char_category(U'a'), CAT_DEFAULT); + EXPECT_TRUE(dict->is_invoke(U'a')); // DEFAULT invoke=1 + EXPECT_FALSE(dict->is_invoke(0x6771)); // KANJI invoke=0 + + WordIdRun urun = dict->unknown_run(CAT_DEFAULT); + ASSERT_EQ(urun.count, 1U); + const WordEntry& ue = dict->unknown_word(urun.entry_start); + EXPECT_EQ(ue.word_cost, 4769); + EXPECT_EQ(dict->unknown_feature(ue), std::string_view("unk-default")); +} + +TEST_F(KuromojiDictionaryTest, RejectsBadMagic) { + // NOTE: despite the test name, nothing is corrupted here; only missing files are exercised. + std::string bad = std::string(::testing::TempDir()) + "/kmj_dict_bad"; + ::mkdir(bad.c_str(), 0755); + // No dictionary files are written into it (TODO: also cover a system.bin with a bad magic). + std::unique_ptr dict; + // Point at a directory missing the files -> must fail, not crash. 
+ EXPECT_FALSE(KuromojiDictionary::load(bad, &dict).ok()); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_ipadic_parser_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_ipadic_parser_test.cpp new file mode 100644 index 00000000000000..dde821036c0ac3 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_ipadic_parser_test.cpp @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_ipadic_parser.h" + +#include + +#include + +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" + +namespace doris::segment_v2::kuromoji { + +TEST(KuromojiIpadicParserTest, CategoryOrdinal) { + EXPECT_EQ(ipadic_category_ordinal("KANJI"), CAT_KANJI); + EXPECT_EQ(ipadic_category_ordinal(" DEFAULT "), CAT_DEFAULT); + EXPECT_EQ(ipadic_category_ordinal("KANJINUMERIC"), CAT_KANJINUMERIC); + EXPECT_EQ(ipadic_category_ordinal("NOPE"), CAT_CLASS_COUNT); +} + +TEST(KuromojiIpadicParserTest, LexiconLine) { + std::string surface; + BuilderWord w; + // Real IPADIC row shape: surface,left,right,cost,<9 feature cols>. 
+ ASSERT_TRUE(parse_lexicon_line("\xE4\xBB\x95\xE8\x88\x9E\xE3\x81\x84,1285,1285,5543," + "noun,general,*,*,*,*,base,reading,pron", + &surface, &w) + .ok()); + EXPECT_EQ(surface, "\xE4\xBB\x95\xE8\x88\x9E\xE3\x81\x84"); // 仕舞い + EXPECT_EQ(w.left_id, 1285); + EXPECT_EQ(w.right_id, 1285); + EXPECT_EQ(w.word_cost, 5543); + EXPECT_EQ(w.feature, "noun,general,*,*,*,*,base,reading,pron"); + + // Negative cost is valid. + ASSERT_TRUE(parse_lexicon_line("x,1,2,-300,noun,*", &surface, &w).ok()); + EXPECT_EQ(w.word_cost, -300); + + // Too few columns -> error. + EXPECT_FALSE(parse_lexicon_line("x,1,2", &surface, &w).ok()); +} + +TEST(KuromojiIpadicParserTest, MatrixDef) { + MatrixInput m; + ASSERT_TRUE(parse_matrix_def("2 2\n0 0 -100\n1 1 50\n# trailing comment\n", &m).ok()); + EXPECT_EQ(m.forward_size, 2U); + EXPECT_EQ(m.backward_size, 2U); + ASSERT_EQ(m.cells.size(), 4U); + // data "a b cost" -> cells[b*forward_size + a] + EXPECT_EQ(connection_cost(m.cells.data(), m.forward_size, /*fwd*/ 0, /*bwd*/ 0), -100); + EXPECT_EQ(connection_cost(m.cells.data(), m.forward_size, /*fwd*/ 1, /*bwd*/ 1), 50); + EXPECT_EQ(connection_cost(m.cells.data(), m.forward_size, /*fwd*/ 1, /*bwd*/ 0), 0); +} + +TEST(KuromojiIpadicParserTest, CharDef) { + const std::string content = + "# category definitions\n" + "DEFAULT 0 1 0\n" + "KANJI 0 0 2\n" + "ALPHA 1 1 0\n" + "SYMBOL 1 1 0\n" + "# mappings\n" + "0x4E00..0x9FA5 KANJI\n" + "0x0041..0x005A ALPHA\n" + "0x3007 SYMBOL KANJINUMERIC\n"; + CharDefInput cd; + ASSERT_TRUE(parse_char_def(content, &cd).ok()); + + EXPECT_EQ(cd.defs[CAT_KANJI].invoke, 0); + EXPECT_EQ(cd.defs[CAT_KANJI].group, 0); + EXPECT_EQ(cd.defs[CAT_KANJI].length, 2); + EXPECT_EQ(cd.defs[CAT_ALPHA].invoke, 1); + EXPECT_EQ(cd.defs[CAT_DEFAULT].group, 1); + + EXPECT_EQ(cd.catmap[0x4E00], CAT_KANJI); + EXPECT_EQ(cd.catmap[0x9FA5], CAT_KANJI); + EXPECT_EQ(cd.catmap[0x0041], CAT_ALPHA); // 'A' + EXPECT_EQ(cd.catmap[0x3007], CAT_SYMBOL); // first category is primary + 
EXPECT_EQ(cd.catmap[0x0040], CAT_DEFAULT); // unmapped -> DEFAULT +} + +TEST(KuromojiIpadicParserTest, UnkDef) { + const std::string content = + "DEFAULT,5,5,4769,symbol,general,*,*,*,*,*\n" + "KANJI,1285,1285,11426,noun,general,*,*,*,*,*\n"; + UnkDictInput unk; + ASSERT_TRUE(parse_unk_def(content, &unk).ok()); + ASSERT_EQ(unk.per_category.size(), static_cast(CAT_CLASS_COUNT)); + ASSERT_EQ(unk.per_category[CAT_DEFAULT].size(), 1U); + EXPECT_EQ(unk.per_category[CAT_DEFAULT][0].word_cost, 4769); + ASSERT_EQ(unk.per_category[CAT_KANJI].size(), 1U); + EXPECT_EQ(unk.per_category[CAT_KANJI][0].left_id, 1285); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize_test.cpp new file mode 100644 index 00000000000000..527898d1af3be8 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_normalize_test.cpp @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_normalize.h" + +#include + +#include + +namespace doris::segment_v2::kuromoji { + +TEST(KuromojiNormalizeTest, FullWidthAsciiToAscii) { + EXPECT_EQ(cjk_width_normalize("\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3"), "ABC"); // ABC + EXPECT_EQ(cjk_width_normalize("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93"), "123"); // 123 + // ABC123 + EXPECT_EQ(cjk_width_normalize("\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\x91\xEF\xBC\x92" + "\xEF\xBC\x93"), + "ABC123"); +} + +TEST(KuromojiNormalizeTest, IdeographicSpaceToSpace) { + EXPECT_EQ(cjk_width_normalize("\xE3\x80\x80"), " "); // U+3000 +} + +TEST(KuromojiNormalizeTest, PreservesOtherText) { + EXPECT_EQ(cjk_width_normalize("ABC123"), "ABC123"); // already ASCII + EXPECT_EQ(cjk_width_normalize("\xE6\x9D\xB1\xE4\xBA\xAC"), + "\xE6\x9D\xB1\xE4\xBA\xAC"); // 東京 unchanged + EXPECT_EQ(cjk_width_normalize(""), ""); + // mixed: A東B -> A東B + EXPECT_EQ(cjk_width_normalize("\xEF\xBC\xA1\xE6\x9D\xB1\xEF\xBC\xA2"), + "A\xE6\x9D\xB1" + "B"); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_real_dict_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_real_dict_test.cpp new file mode 100644 index 00000000000000..3a4a63cf329bba --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_real_dict_test.cpp @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "CLucene.h" +#include "common/config.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h" +#include "storage/index/inverted/inverted_index_parser.h" + +// End-to-end against the REAL IPADIC dictionary generated under be/dict/kuromoji. +// Skips if the dictionary has not been generated on this host (e.g. plain CI), +// so it is safe to keep in the suite. +namespace doris::segment_v2::kuromoji { + +static std::string real_dict_dir() { + const char* home = std::getenv("DORIS_HOME"); + return std::string(home != nullptr ? 
home : ".") + "/be/dict/kuromoji"; +} + +static bool real_dict_present() { + struct stat st {}; + return ::stat((real_dict_dir() + "/system.bin").c_str(), &st) == 0; +} + +static std::vector segment_surfaces(const KuromojiDictionary& dict, + const std::string& text) { + KuromojiViterbi v(dict); + std::vector ms; + v.segment(text, &ms); + std::vector out; + out.reserve(ms.size()); + for (const auto& m : ms) { + out.emplace_back(text.substr(m.byte_start, m.byte_len)); + } + return out; +} + +TEST(KuromojiRealDictTest, SegmentsRealJapanese) { + if (!real_dict_present()) { + GTEST_SKIP() << "real IPADIC dictionary not generated at " << real_dict_dir(); + } + std::unique_ptr dict; + ASSERT_TRUE(KuromojiDictionary::load(real_dict_dir(), &dict).ok()); + + // 東京都に住んでいます ("I live in Tokyo"), 10 code points. + const std::string text = + "\xE6\x9D\xB1\xE4\xBA\xAC\xE9\x83\xBD\xE3\x81\xAB\xE4\xBD\x8F" + "\xE3\x82\x93\xE3\x81\xA7\xE3\x81\x84\xE3\x81\xBE\xE3\x81\x99"; + const std::vector toks = segment_surfaces(*dict, text); + + // Lossless coverage: morphemes concatenate back to the input. + std::string concat; + for (const auto& t : toks) { + concat += t; + } + EXPECT_EQ(concat, text); + + // Real morphological segmentation: more than one token, but well under the + // 10 code points a per-character split would produce. + EXPECT_GT(toks.size(), 1U); + EXPECT_LT(toks.size(), 10U); + + // The particle "に" must be isolated as its own token. 
+ EXPECT_NE(std::find(toks.begin(), toks.end(), std::string("\xE3\x81\xAB")), toks.end()); + + std::cout << "segmentation of 東京都に住んでいます:"; + for (const auto& t : toks) { + std::cout << " [" << t << "]"; + } + std::cout << std::endl; +} + +TEST(KuromojiRealDictTest, KnownCompoundWord) { + if (!real_dict_present()) { + GTEST_SKIP() << "real IPADIC dictionary not generated at " << real_dict_dir(); + } + std::unique_ptr dict; + ASSERT_TRUE(KuromojiDictionary::load(real_dict_dir(), &dict).ok()); + + // 日本語 ("Japanese language") is a single IPADIC entry -> one token. + const std::string nihongo = "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"; + const std::vector toks = segment_surfaces(*dict, nihongo); + std::cout << "segmentation of 日本語:"; + for (const auto& t : toks) { + std::cout << " [" << t << "]"; + } + std::cout << std::endl; + EXPECT_EQ(toks.size(), 1U); + EXPECT_EQ(toks[0], nihongo); +} + +// Full analyzer path: parser="kuromoji" -> create_builtin_analyzer -> initDict loads +// the real dictionary -> tokenStream emits morphemes. Points the dict-path config at +// be/dict (so appending "/kuromoji" resolves) and restores it so other tests are unaffected. +TEST(KuromojiRealDictTest, AnalyzerSegmentsViaBuiltinParser) { + if (!real_dict_present()) { + GTEST_SKIP() << "real IPADIC dictionary not generated at " << real_dict_dir(); + } + const char* home = std::getenv("DORIS_HOME"); + const std::string saved = doris::config::inverted_index_dict_path; + doris::config::inverted_index_dict_path = + std::string(home != nullptr ? 
home : ".") + "/be/dict"; + const bool saved_enable = doris::config::enable_kuromoji_analyzer; + doris::config::enable_kuromoji_analyzer = true; + + std::vector toks; + { + auto analyzer = + doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_builtin_analyzer( + doris::InvertedIndexParserType::PARSER_KUROMOJI, + doris::INVERTED_INDEX_PARSER_KUROMOJI_SEARCH, "true", "none"); + ASSERT_NE(analyzer, nullptr); + + const std::string text = + "\xE6\x9D\xB1\xE4\xBA\xAC\xE9\x83\xBD\xE3\x81\xAB\xE4\xBD\x8F" + "\xE3\x82\x93\xE3\x81\xA7\xE3\x81\x84\xE3\x81\xBE\xE3\x81\x99"; + lucene::util::SStringReader reader; + reader.init(text.data(), text.size(), false); + std::unique_ptr ts( + (lucene::analysis::TokenStream*)analyzer->tokenStream(L"", &reader)); + lucene::analysis::Token t; + while (ts->next(&t) != nullptr) { + toks.emplace_back(t.termBuffer(), static_cast(t.termLength())); + } + } + doris::config::inverted_index_dict_path = saved; + doris::config::enable_kuromoji_analyzer = saved_enable; + + std::cout << "analyzer tokens for 東京都に住んでいます:"; + for (const auto& t : toks) { + std::cout << " [" << t << "]"; + } + std::cout << std::endl; + + EXPECT_GT(toks.size(), 1U); + // Part-of-speech stop filtering removes the particle に. + EXPECT_EQ(std::find(toks.begin(), toks.end(), std::string("\xE3\x81\xAB")), toks.end()); // に + // Content words survive; 東京 is kept as-is. + EXPECT_NE(std::find(toks.begin(), toks.end(), std::string("\xE6\x9D\xB1\xE4\xBA\xAC")), + toks.end()); // 東京 + // Base-form normalization rewrites the conjugated 住ん to its lemma 住む. 
+ EXPECT_EQ(std::find(toks.begin(), toks.end(), std::string("\xE4\xBD\x8F\xE3\x82\x93")), + toks.end()); // 住ん (surface) absent + EXPECT_NE(std::find(toks.begin(), toks.end(), std::string("\xE4\xBD\x8F\xE3\x82\x80")), + toks.end()); // 住む (base form) present +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi_test.cpp new file mode 100644 index 00000000000000..5d635895b36165 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi_test.cpp @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/index/inverted/analyzer/kuromoji/kuromoji_viterbi.h" + +#include +#include + +#include +#include +#include +#include + +#include "CLucene.h" +#include "storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dict_format.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary.h" +#include "storage/index/inverted/analyzer/kuromoji/dict/kuromoji_dictionary_builder.h" + +namespace doris::segment_v2::kuromoji { + +// 東 = E6 9D B1 (U+6771), 京 = E4 BA AC (U+4EAC), 府 = E5 BA 9C (U+5E9C) +static const std::string TOU = "\xE6\x9D\xB1"; +static const std::string KYO = "\xE4\xBA\xAC"; +static const std::string FU = "\xE5\xBA\x9C"; + +class KuromojiViterbiTest : public ::testing::Test { +protected: + std::string _dir; + std::unique_ptr _dict; + + void SetUp() override { + _dir = std::string(::testing::TempDir()) + "/kmj_viterbi"; + ::mkdir(_dir.c_str(), 0755); + + // Lexicon: single chars cost 1000 each; the compound "東京" costs 100. + // With an all-zero connection matrix, the min-cost path must pick "東京". 
+ SystemDictInput sys; + sys.surfaces.push_back({TOU, {{1, 1, 1000, "noun,East"}}}); + sys.surfaces.push_back({KYO, {{1, 1, 1000, "noun,Capital"}}}); + sys.surfaces.push_back({TOU + KYO, {{1, 1, 100, "noun,Tokyo"}}}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_system(_dir + "/system.bin", sys).ok()); + + MatrixInput m; + m.forward_size = 2; // context ids: 0 = BOS/EOS, 1 = word + m.backward_size = 2; + m.cells.assign(4, 0); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_matrix(_dir + "/matrix.bin", m).ok()); + + CharDefInput cd; + cd.catmap.fill(CAT_DEFAULT); + cd.catmap[0x6771] = CAT_KANJI; // 東 + cd.catmap[0x4EAC] = CAT_KANJI; // 京 + cd.catmap[0x5E9C] = CAT_KANJI; // 府 (not in lexicon -> unknown) + cd.defs.assign(CAT_CLASS_COUNT, CategoryDef {0, 0, 0}); + cd.defs[CAT_KANJI] = CategoryDef {0, 0, 0}; // invoke=0, group=0 -> single-char unknown + cd.defs[CAT_DEFAULT] = CategoryDef {1, 1, 0}; // invoke=1, group=1 + ASSERT_TRUE(KuromojiDictionaryBuilder::write_chardef(_dir + "/chardef.bin", cd).ok()); + + UnkDictInput unk; + unk.per_category.resize(CAT_CLASS_COUNT); + unk.per_category[CAT_KANJI].push_back({1, 1, 5000, "unknown,kanji"}); + unk.per_category[CAT_DEFAULT].push_back({1, 1, 5000, "unknown,default"}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_unkdict(_dir + "/unkdict.bin", unk).ok()); + + ASSERT_TRUE(KuromojiDictionary::load(_dir, &_dict).ok()); + } + + std::vector surfaces(std::string_view text) const { + KuromojiViterbi v(*_dict); + std::vector ms; + v.segment(text, &ms); + std::vector out; + for (const auto& m : ms) { + out.emplace_back(text.substr(m.byte_start, m.byte_len)); + } + return out; + } +}; + +TEST_F(KuromojiViterbiTest, PrefersCompoundOverSingleChars) { + // "東京" must segment as one word, not 東 + 京. + EXPECT_EQ(surfaces(TOU + KYO), (std::vector {TOU + KYO})); +} + +TEST_F(KuromojiViterbiTest, KnownPlusUnknown) { + // "東京府" -> 東京 (known) + 府 (unknown), since 東京(100)+府(5000) beats 東+京+府. 
+ KuromojiViterbi v(*_dict); + std::vector ms; + v.segment(TOU + KYO + FU, &ms); + ASSERT_EQ(ms.size(), 2U); + EXPECT_EQ(ms[0].byte_start, 0U); + EXPECT_EQ(ms[0].byte_len, 6U); + EXPECT_TRUE(ms[0].known); + EXPECT_EQ(ms[1].byte_start, 6U); + EXPECT_EQ(ms[1].byte_len, 3U); + EXPECT_FALSE(ms[1].known); + EXPECT_EQ((TOU + KYO + FU).substr(ms[1].byte_start, ms[1].byte_len), FU); +} + +TEST_F(KuromojiViterbiTest, SingleKnownWord) { + EXPECT_EQ(surfaces(TOU), (std::vector {TOU})); +} + +TEST_F(KuromojiViterbiTest, EmptyInput) { + EXPECT_TRUE(surfaces("").empty()); +} + +TEST_F(KuromojiViterbiTest, AllUnknown) { + // A lone unknown kanji 府 -> exactly one unknown morpheme covering it. + KuromojiViterbi v(*_dict); + std::vector ms; + v.segment(FU, &ms); + ASSERT_EQ(ms.size(), 1U); + EXPECT_FALSE(ms[0].known); + EXPECT_EQ(ms[0].byte_len, 3U); +} + +// End-to-end through the CLucene Tokenizer wrapper: with a dictionary, the +// tokenizer emits Viterbi morphemes ("東京", "府") rather than unigrams. +TEST_F(KuromojiViterbiTest, TokenizerUsesDictionary) { + const std::string s = TOU + KYO + FU; + lucene::util::SStringReader reader; + reader.init(s.data(), s.size(), false); + + KuromojiTokenizer tk(KuromojiMode::Search, true, false, _dict.get()); + tk.reset(&reader); + + std::vector toks; + lucene::analysis::Token t; + while (tk.next(&t) != nullptr) { + toks.emplace_back(t.termBuffer(), static_cast(t.termLength())); + } + EXPECT_EQ(toks, (std::vector {TOU + KYO, FU})); +} + +// 都 = E9 83 BD (U+90FD), 山 = E5 B1 B1 (U+5C71), 川 = E5 B7 9D (U+5DDD) +static const std::string TO = "\xE9\x83\xBD"; +static const std::string YAMA = "\xE5\xB1\xB1"; +static const std::string KAWA = "\xE5\xB7\x9D"; + +// Search-mode compound decomposition, mirroring Lucene's JapaneseTokenizer: +// long all-kanji dictionary words are penalized so the Viterbi prefers their +// shorter parts (better recall), while Normal mode keeps the whole compound and +// short (<= 2 kanji) compounds are never split. 
+class KuromojiSearchModeTest : public ::testing::Test { +protected: + std::string _dir; + std::unique_ptr _dict; + + void SetUp() override { + _dir = std::string(::testing::TempDir()) + "/kmj_search"; + ::mkdir(_dir.c_str(), 0755); + + // Single kanji cost 1000 each; compounds "東京都" and "山川" cost 100. + SystemDictInput sys; + sys.surfaces.push_back({TOU, {{1, 1, 1000, "noun"}}}); + sys.surfaces.push_back({KYO, {{1, 1, 1000, "noun"}}}); + sys.surfaces.push_back({TO, {{1, 1, 1000, "noun"}}}); + sys.surfaces.push_back({YAMA, {{1, 1, 1000, "noun"}}}); + sys.surfaces.push_back({KAWA, {{1, 1, 1000, "noun"}}}); + sys.surfaces.push_back({TOU + KYO + TO, {{1, 1, 100, "noun"}}}); + sys.surfaces.push_back({YAMA + KAWA, {{1, 1, 100, "noun"}}}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_system(_dir + "/system.bin", sys).ok()); + + MatrixInput m; + m.forward_size = 2; + m.backward_size = 2; + m.cells.assign(4, 0); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_matrix(_dir + "/matrix.bin", m).ok()); + + CharDefInput cd; + cd.catmap.fill(CAT_DEFAULT); + cd.catmap[0x6771] = CAT_KANJI; // 東 + cd.catmap[0x4EAC] = CAT_KANJI; // 京 + cd.catmap[0x90FD] = CAT_KANJI; // 都 + cd.catmap[0x5C71] = CAT_KANJI; // 山 + cd.catmap[0x5DDD] = CAT_KANJI; // 川 + cd.defs.assign(CAT_CLASS_COUNT, CategoryDef {0, 0, 0}); + cd.defs[CAT_KANJI] = CategoryDef {0, 0, 0}; + cd.defs[CAT_DEFAULT] = CategoryDef {1, 1, 0}; + ASSERT_TRUE(KuromojiDictionaryBuilder::write_chardef(_dir + "/chardef.bin", cd).ok()); + + UnkDictInput unk; + unk.per_category.resize(CAT_CLASS_COUNT); + unk.per_category[CAT_KANJI].push_back({1, 1, 5000, "unknown"}); + unk.per_category[CAT_DEFAULT].push_back({1, 1, 5000, "unknown"}); + ASSERT_TRUE(KuromojiDictionaryBuilder::write_unkdict(_dir + "/unkdict.bin", unk).ok()); + + ASSERT_TRUE(KuromojiDictionary::load(_dir, &_dict).ok()); + } + + std::vector surfaces(std::string_view text, KuromojiMode mode) const { + KuromojiViterbi v(*_dict, mode); + std::vector ms; + v.segment(text, &ms); + 
std::vector out; + for (const auto& m : ms) { + out.emplace_back(text.substr(m.byte_start, m.byte_len)); + } + return out; + } +}; + +TEST_F(KuromojiSearchModeTest, NormalModeKeepsCompound) { + // 東京都(100) beats 東+京+都(3000): the compound is one token. + EXPECT_EQ(surfaces(TOU + KYO + TO, KuromojiMode::Normal), + (std::vector {TOU + KYO + TO})); +} + +TEST_F(KuromojiSearchModeTest, SearchModeDecompoundsKanjiCompound) { + // 東京都 is 3 kanji: penalty (3-2)*3000 makes it 3100, so 東+京+都(3000) wins. + EXPECT_EQ(surfaces(TOU + KYO + TO, KuromojiMode::Search), + (std::vector {TOU, KYO, TO})); +} + +TEST_F(KuromojiSearchModeTest, SearchModeKeepsShortCompound) { + // 山川 is only 2 kanji (not > SEARCH_MODE_KANJI_LENGTH), so no penalty: it is + // kept whole even in search mode. + EXPECT_EQ(surfaces(YAMA + KAWA, KuromojiMode::Search), + (std::vector {YAMA + KAWA})); +} + +TEST_F(KuromojiSearchModeTest, SearchModeKeepsUnknownGroupWhole) { + // "abc" is an out-of-vocabulary run that segments as one unknown group token + // in normal/search mode. + EXPECT_EQ(surfaces("abc", KuromojiMode::Search), (std::vector {"abc"})); +} + +TEST_F(KuromojiSearchModeTest, ExtendedModeSplitsUnknownIntoUnigrams) { + // Extended mode additionally decomposes an unknown word into per-character + // unigrams (mirroring Lucene JapaneseTokenizer's EXTENDED mode). + EXPECT_EQ(surfaces("abc", KuromojiMode::Extended), (std::vector {"a", "b", "c"})); +} + +TEST_F(KuromojiSearchModeTest, ExtendedModeStillDecompoundsKnownCompound) { + // Extended mode keeps search-mode behavior for known compounds. 
+ EXPECT_EQ(surfaces(TOU + KYO + TO, KuromojiMode::Extended), + (std::vector {TOU, KYO, TO})); +} + +} // namespace doris::segment_v2::kuromoji diff --git a/be/test/storage/index/inverted/analyzer/kuromoji_analyzer_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji_analyzer_test.cpp new file mode 100644 index 00000000000000..6b6ff2f5e5f16c --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji_analyzer_test.cpp @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include + +#include "CLucene.h" +#include "common/config.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/inverted_index_parser.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2::inverted_index { + +class KuromojiAnalyzerTest : public ::testing::Test { +protected: + bool _saved = false; + void SetUp() override { _saved = config::enable_kuromoji_analyzer; } + void TearDown() override { config::enable_kuromoji_analyzer = _saved; } +}; + +TEST_F(KuromojiAnalyzerTest, BuiltinDispatchTokenizes) { + config::enable_kuromoji_analyzer = true; + auto analyzer = InvertedIndexAnalyzer::create_builtin_analyzer( + InvertedIndexParserType::PARSER_KUROMOJI, INVERTED_INDEX_PARSER_KUROMOJI_SEARCH, "true", + "none"); + ASSERT_NE(analyzer, nullptr); + + std::string s = "東京都"; + lucene::util::SStringReader reader; + reader.init(s.data(), s.size(), false); + + std::unique_ptr ts((TokenStream*)analyzer->tokenStream(L"", &reader)); + std::vector out; + Token t; + while (ts->next(&t)) { + out.emplace_back(t.termBuffer(), t.termLength()); + } + EXPECT_EQ(out, (std::vector {"東", "京", "都"})); +} + +TEST_F(KuromojiAnalyzerTest, DisabledByConfigThrows) { + config::enable_kuromoji_analyzer = false; + EXPECT_ANY_THROW({ + (void)InvertedIndexAnalyzer::create_builtin_analyzer( + InvertedIndexParserType::PARSER_KUROMOJI, INVERTED_INDEX_PARSER_KUROMOJI_SEARCH, + "true", "none"); + }); +} + +} // namespace doris::segment_v2::inverted_index diff --git a/be/test/storage/index/inverted/analyzer/kuromoji_tokenizer_test.cpp b/be/test/storage/index/inverted/analyzer/kuromoji_tokenizer_test.cpp new file mode 100644 index 00000000000000..184afafe6f8f55 --- /dev/null +++ b/be/test/storage/index/inverted/analyzer/kuromoji_tokenizer_test.cpp @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "CLucene.h" +#include "storage/index/inverted/analyzer/kuromoji/KuromojiTokenizer.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2 { + +static std::vector tokenize(const std::string& s) { + KuromojiTokenizer tk(KuromojiMode::Search, true, false); + lucene::util::SStringReader reader; + reader.init(s.data(), s.size(), false); + tk.reset(&reader); + std::vector out; + Token t; + while (tk.next(&t)) { + out.emplace_back(t.termBuffer(), t.termLength()); + } + return out; +} + +TEST(KuromojiTokenizerStubTest, CjkUnigram) { + EXPECT_EQ(tokenize("東京都"), (std::vector {"東", "京", "都"})); +} + +TEST(KuromojiTokenizerStubTest, SkipsAsciiSpaces) { + EXPECT_EQ(tokenize("a b"), (std::vector {"a", "b"})); +} + +} // namespace doris::segment_v2 diff --git a/be/test/storage/index/inverted/kuromoji_parser_type_test.cpp b/be/test/storage/index/inverted/kuromoji_parser_type_test.cpp new file mode 100644 index 00000000000000..db4789e31ed595 --- /dev/null +++ b/be/test/storage/index/inverted/kuromoji_parser_type_test.cpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "storage/index/inverted/inverted_index_parser.h" + +namespace doris { + +TEST(KuromojiParserTypeTest, StringToEnum) { + EXPECT_EQ(get_inverted_index_parser_type_from_string("kuromoji"), + InvertedIndexParserType::PARSER_KUROMOJI); + EXPECT_EQ(get_inverted_index_parser_type_from_string("KUROMOJI"), + InvertedIndexParserType::PARSER_KUROMOJI); +} + +TEST(KuromojiParserTypeTest, EnumToString) { + EXPECT_EQ(inverted_index_parser_type_to_string(InvertedIndexParserType::PARSER_KUROMOJI), + "kuromoji"); +} + +TEST(KuromojiParserTypeTest, DefaultModeIsSearch) { + std::map props = {{"parser", "kuromoji"}}; + EXPECT_EQ(get_parser_mode_string_from_properties(props), "search"); +} + +} // namespace doris diff --git a/dist/licenses/LICENSE-darts-clone.txt b/dist/licenses/LICENSE-darts-clone.txt new file mode 100644 index 00000000000000..43ed4225a607e1 --- /dev/null +++ b/dist/licenses/LICENSE-darts-clone.txt @@ -0,0 +1,11 @@ +# The BSD 2-clause license + +Copyright (c) 2008-2014, Susumu Yata +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/dist/licenses/LICENSE-ipadic.txt b/dist/licenses/LICENSE-ipadic.txt new file mode 100644 index 00000000000000..d0fed662d33a0e --- /dev/null +++ b/dist/licenses/LICENSE-ipadic.txt @@ -0,0 +1,71 @@ +Copyright 2000, 2001, 2002, 2003 Nara Institute of Science +and Technology. All Rights Reserved. + +Use, reproduction, and distribution of this software is permitted. +Any copy of this software, whether in its original form or modified, +must include both the above copyright notice and the following +paragraphs. 
+ +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. 
+ +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ diff --git a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/InvertedIndexProperties.java b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/InvertedIndexProperties.java index 3a564f893b215c..179dd0de0a9eae 100644 --- a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/InvertedIndexProperties.java +++ b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/InvertedIndexProperties.java @@ -35,11 +35,15 @@ public class InvertedIndexProperties { public static String INVERTED_INDEX_PARSER_ICU = "icu"; public static String INVERTED_INDEX_PARSER_BASIC = "basic"; public static String INVERTED_INDEX_PARSER_IK = "ik"; + public static String INVERTED_INDEX_PARSER_KUROMOJI = "kuromoji"; public static String INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; public static String INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained"; public static String INVERTED_INDEX_PARSER_SMART = "ik_smart"; + public static String INVERTED_INDEX_PARSER_KUROMOJI_NORMAL = "normal"; + public static String INVERTED_INDEX_PARSER_KUROMOJI_SEARCH = "search"; + public static String INVERTED_INDEX_PARSER_KUROMOJI_EXTENDED = "extended"; public static String INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; public static String INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 54129adf81bed0..09339eb58bb2ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -53,6 +53,7 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_ICU = InvertedIndexProperties.INVERTED_INDEX_PARSER_ICU; public static String INVERTED_INDEX_PARSER_BASIC = 
InvertedIndexProperties.INVERTED_INDEX_PARSER_BASIC; public static String INVERTED_INDEX_PARSER_IK = InvertedIndexProperties.INVERTED_INDEX_PARSER_IK; + public static String INVERTED_INDEX_PARSER_KUROMOJI = InvertedIndexProperties.INVERTED_INDEX_PARSER_KUROMOJI; public static String INVERTED_INDEX_PARSER_MODE_KEY = InvertedIndexProperties.INVERTED_INDEX_PARSER_MODE_KEY; @@ -124,7 +125,8 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c || parser.equals(INVERTED_INDEX_PARSER_CHINESE) || parser.equals(INVERTED_INDEX_PARSER_ICU) || parser.equals(INVERTED_INDEX_PARSER_BASIC) - || parser.equals(INVERTED_INDEX_PARSER_IK))) { + || parser.equals(INVERTED_INDEX_PARSER_IK) + || parser.equals(INVERTED_INDEX_PARSER_KUROMOJI))) { throw new AnalysisException("INVERTED index parser: " + parser + " is invalid for column: " + indexColName + " of type " + colType); } @@ -207,9 +209,10 @@ private static void checkInvertedIndexProperties(Map properties, checkAnalyzerName(analyzerName, colType); checkNormalizerName(normalizerName, colType); - if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu|basic|ik")) { + if (parser != null + && !parser.matches("none|english|unicode|chinese|standard|icu|basic|ik|kuromoji")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser - + ", parser must be none, english, unicode, chinese, icu, basic or ik"); + + ", parser must be none, english, unicode, chinese, icu, basic, ik or kuromoji"); } if (parserMode != null) { @@ -223,8 +226,13 @@ private static void checkInvertedIndexProperties(Map properties, throw new AnalysisException("Invalid inverted index 'parser_mode' value: " + parserMode + ", parser_mode must be ik_max_word or ik_smart for ik parser"); } - } else if (parserMode != null) { - throw new AnalysisException("parser_mode is only available for chinese and ik parser"); + } else if (INVERTED_INDEX_PARSER_KUROMOJI.equals(parser)) { + if 
(!parserMode.matches("search|normal|extended")) { + throw new AnalysisException("Invalid inverted index 'parser_mode' value: " + parserMode + + ", parser_mode must be search, normal or extended for kuromoji parser"); + } + } else { + throw new AnalysisException("parser_mode is only available for chinese, ik and kuromoji parser"); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java index fb0338c8ccf460..b072dae45bc3a9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java @@ -69,7 +69,7 @@ public class IndexPolicy implements Writable, GsonPostProcessable { "empty", "char_replace", "icu_normalizer"); public static final Set BUILTIN_ANALYZERS = ImmutableSet.of( - "none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik"); + "none", "standard", "unicode", "english", "chinese", "icu", "basic", "ik", "kuromoji"); public static final Set BUILTIN_NORMALIZERS = ImmutableSet.of("lowercase"); diff --git a/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexKuromojiValidationTest.java b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexKuromojiValidationTest.java new file mode 100644 index 00000000000000..609252f1b9285f --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexKuromojiValidationTest.java @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.Map; + +public class InvertedIndexKuromojiValidationTest { + + @Test + public void acceptsKuromojiParserWithSearchMode() throws AnalysisException { + Map props = new HashMap<>(); + props.put("parser", "kuromoji"); + props.put("parser_mode", "search"); + InvertedIndexUtil.checkInvertedIndexParser( + "content", PrimitiveType.STRING, props, TInvertedIndexFileStorageFormat.V2); + } + + @Test + public void rejectsInvalidKuromojiMode() { + Map props = new HashMap<>(); + props.put("parser", "kuromoji"); + props.put("parser_mode", "bogus"); + Assertions.assertThrows(AnalysisException.class, () -> + InvertedIndexUtil.checkInvertedIndexParser( + "content", PrimitiveType.STRING, props, TInvertedIndexFileStorageFormat.V2)); + } +} diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_japanese_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_japanese_analyzer.groovy new file mode 100644 index 00000000000000..c54ee9f2a4ca58 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/analyzer/test_japanese_analyzer.groovy @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_japanese_analyzer", "p0") { + def tableName = "test_japanese_analyzer" + + def backendId_to_backendIP = [:] + def backendId_to_backendHttpPort = [:] + getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort) + def set_be_config = { key, value -> + for (String backend_id : backendId_to_backendIP.keySet()) { + update_be_config(backendId_to_backendIP.get(backend_id), + backendId_to_backendHttpPort.get(backend_id), key, value) + } + } + + sql "DROP TABLE IF EXISTS ${tableName}" + // kuromoji is disabled by default; enable it for this test. 
+ set_be_config("enable_kuromoji_analyzer", "true") + try { + sql """ + CREATE TABLE ${tableName} ( + `id` int(11) NULL COMMENT "", + `content` text NULL COMMENT "", + INDEX content_idx (`content`) USING INVERTED PROPERTIES("parser" = "kuromoji", "parser_mode" = "search") COMMENT '', + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ INSERT INTO ${tableName} VALUES (1, "東京都に住んでいます"); """ + sql """ INSERT INTO ${tableName} VALUES (2, "私は寿司が好きです"); """ + sql """ INSERT INTO ${tableName} VALUES (3, "Apache Doris は高速です"); """ + sql "sync" + + // The kuromoji dictionary is not shipped in the p0 package, so the + // analyzer falls back to CJK unigram. + def tokyo = sql """ SELECT id FROM ${tableName} WHERE content MATCH '東' ORDER BY id; """ + assertEquals(1, tokyo.size()) + assertTrue(tokyo[0][0] == 1) + + def sushi = sql """ SELECT id FROM ${tableName} WHERE content MATCH '寿' ORDER BY id; """ + assertEquals(1, sushi.size()) + assertTrue(sushi[0][0] == 2) + + // Verify the TOKENIZE function dispatches to the kuromoji parser. + // Quoting follows the literal-string form proven in test_tokenize.groovy:97 — + // property string uses double-quoted keys/values inside a single-quoted outer string. 
+ def tokens = sql """SELECT TOKENIZE('東京都', '"parser"="kuromoji"');""" + def tokenStr = tokens[0][0].toString() + assertTrue(tokenStr.contains('"token": "東"')) + assertTrue(tokenStr.contains('"token": "京"')) + assertTrue(tokenStr.contains('"token": "都"')) + } finally { + sql "DROP TABLE IF EXISTS ${tableName}" + set_be_config("enable_kuromoji_analyzer", "false") + } +} diff --git a/regression-test/suites/inverted_index_p0/test_properties.groovy b/regression-test/suites/inverted_index_p0/test_properties.groovy index 16c255be72d8a5..6a4d6e32017dcc 100644 --- a/regression-test/suites/inverted_index_p0/test_properties.groovy +++ b/regression-test/suites/inverted_index_p0/test_properties.groovy @@ -84,7 +84,7 @@ suite("test_properties", "p0"){ "replication_allocation" = "tag.location.default: 1" ); """ - create_table_with_inverted_index_properties(wrong_parser_mode, "parser_mode is only available for chinese and ik parser") + create_table_with_inverted_index_properties(wrong_parser_mode, "parser_mode is only available for chinese, ik and kuromoji parser") assertEquals(success, false) def valid_parser_and_mode = """ diff --git a/regression-test/suites/variant_p0/predefine/test_variant_field_pattern_invalid_inverted_index.groovy b/regression-test/suites/variant_p0/predefine/test_variant_field_pattern_invalid_inverted_index.groovy index 88564c366b4482..3ea6dfedfc3ffd 100644 --- a/regression-test/suites/variant_p0/predefine/test_variant_field_pattern_invalid_inverted_index.groovy +++ b/regression-test/suites/variant_p0/predefine/test_variant_field_pattern_invalid_inverted_index.groovy @@ -219,7 +219,7 @@ suite("test_variant_field_pattern_invalid_inverted_index", "p0") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ("replication_allocation" = "tag.location.default: 1") """ - exception("parser_mode is only available for chinese and ik parser") + exception("parser_mode is only available for chinese, ik and kuromoji parser") } sql "DROP TABLE IF EXISTS ${tableName}" diff 
--git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index 0da907b3882d80..d7ae3796a85f31 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -1969,6 +1969,14 @@ build_icu() { make install } +# mecab-ipadic: no build step; the unpacked dictionary source tree is copied as-is into ${TP_INSTALL_DIR}/share, replacing any copy left by an earlier run +build_mecab_ipadic() { + check_if_source_exist "${MECAB_IPADIC_SOURCE}" + rm -rf "${TP_INSTALL_DIR}/share/${MECAB_IPADIC_SOURCE}" + mkdir -p "${TP_INSTALL_DIR}/share" + cp -r "${TP_SOURCE_DIR}/${MECAB_IPADIC_SOURCE}" "${TP_INSTALL_DIR}/share/${MECAB_IPADIC_SOURCE}" +} + # jindofs build_jindofs() { check_if_source_exist "${JINDOFS_SOURCE}" @@ -2161,6 +2169,7 @@ if [[ "${#packages[@]}" -eq 0 ]]; then azure brotli icu + mecab_ipadic pugixml paimon_cpp ) @@ -2258,6 +2267,7 @@ cleanup_package_source() { azure) src_var="AZURE_SOURCE" ;; dragonbox) src_var="DRAGONBOX_SOURCE" ;; icu) src_var="ICU_SOURCE" ;; + mecab_ipadic) src_var="MECAB_IPADIC_SOURCE" ;; jindofs) src_var="JINDOFS_SOURCE" ;; juicefs) src_var="JUICEFS_SOURCE" ;; pugixml) src_var="PUGIXML_SOURCE" ;; diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index 13ab593312d7d9..2aca9fb94703e4 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -548,6 +548,12 @@ ICU_NAME=release-69-1.tar.gz ICU_SOURCE=icu-release-69-1 ICU_MD5SUM="135125f633864285d637db5c01e0388b" +# mecab-ipadic (UTF-8) for the Japanese kuromoji analyzer +MECAB_IPADIC_DOWNLOAD="https://github.com/lindera/mecab-ipadic/archive/refs/tags/2.7.0-20250920.tar.gz" +MECAB_IPADIC_NAME="mecab-ipadic-2.7.0-20250920.tar.gz" +MECAB_IPADIC_SOURCE="mecab-ipadic-2.7.0-20250920" +MECAB_IPADIC_MD5SUM="a95c409f12f1023fce8ef91f991ef042" + # jindofs JINDOFS_DOWNLOAD="https://github.com/apache/doris-thirdparty/releases/download/alibabacloud-jindodata-releases/jindofs-6.10.4-libs-0.1.tar.gz" JINDOFS_NAME=jindofs-6.10.4-libs-0.1.tar.gz @@ -575,6 +581,7 @@ PAIMON_CPP_MD5SUM="b8599a0421dbf1ec05e2f1a481d64e87" # all thirdparties which need to be downloaded is set in array TP_ARCHIVES export TP_ARCHIVES=( +
'MECAB_IPADIC' 'LIBEVENT' 'OPENSSL' 'THRIFT'