diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 73f9bbc9b..73a08ae6c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,7 +60,7 @@ jobs: export DEBIAN_FRONTEND=noninteractive apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \ libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \ - libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev + libxml2-dev zip libzstd-dev debhelper moreutils bison update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \ --slave /usr/bin/g++ g++ /usr/bin/g++-9 run: | diff --git a/docs/build-from-source.md b/docs/build-from-source.md index 4081cd884..7b87667f2 100644 --- a/docs/build-from-source.md +++ b/docs/build-from-source.md @@ -16,21 +16,21 @@ On Debian/Ubuntu: ```bash sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \ - autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev + autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev ``` On Fedora: ```bash sudo dnf install -y automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \ - openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel + openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel ``` On openSUSE: ```bash sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \ - openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \ + openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \ libboost_context-devel libboost_system-devel ``` diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c18f6b315..5fda2ffbc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -92,6 +92,13 @@ add_third_party( LIB "none" ) +add_third_party( + uni-algo + URL https://github.com/uni-algo/uni-algo/archive/refs/tags/v1.0.0.tar.gz + + BUILD_IN_SOURCE 1 +) + add_library(TRDP::jsoncons INTERFACE IMPORTED) add_dependencies(TRDP::jsoncons jsoncons_project) set_target_properties(TRDP::jsoncons PROPERTIES diff --git a/src/core/search/CMakeLists.txt b/src/core/search/CMakeLists.txt index bbbe89e53..9c74122e8 100644 --- a/src/core/search/CMakeLists.txt +++ b/src/core/search/CMakeLists.txt @@ -3,14 +3,11 @@ gen_bison(parser) cur_gen_dir(gen_dir) -find_package(ICU REQUIRED COMPONENTS uc i18n) - add_library(query_parser ast_expr.cc query_driver.cc search.cc indices.cc vector_utils.cc compressed_sorted_set.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc) -target_link_libraries(query_parser ICU::uc ICU::i18n) +target_link_libraries(query_parser base absl::strings TRDP::reflex TRDP::uni-algo) -target_link_libraries(query_parser base absl::strings TRDP::reflex) cxx_test(compressed_sorted_set_test query_parser LABELS DFLY) cxx_test(search_parser_test query_parser LABELS DFLY) cxx_test(search_test query_parser LABELS DFLY) diff --git a/src/core/search/indices.cc b/src/core/search/indices.cc index 032e03fe4..d03d0e586 100644 --- a/src/core/search/indices.cc +++ b/src/core/search/indices.cc @@ -8,12 +8,14 @@ #include #include #include -#include -#include + +#define UNI_ALGO_DISABLE_NFKC_NFKD + +#include +#include #include #include -#include #include "base/logging.h" @@ -28,60 +30,13 @@ bool IsAllAscii(string_view sv) { } // Get all words from text as matched by the ICU library -absl::flat_hash_set ICUTokenizeWords(std::string_view text) { - // Is text contains only ascii, skip working with ICU resources - if (IsAllAscii(text)) { - std::regex rx{"\\b.*?\\b", std::regex_constants::icase}; - std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{}; - - absl::flat_hash_set words; - for (auto it = begin; it != end; ++it) { - auto word = it->str(); - absl::AsciiStrToLower(&word); - words.insert(move(word)); - } - return words; - } - - icu::UnicodeString uStr(text.data(), text.size(), "UTF-8"); - - UErrorCode status = U_ZERO_ERROR; - std::unique_ptr wordIter{ - icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)}; - - if (U_FAILURE(status)) - return {}; - - wordIter->setText(uStr); - - std::string tmpStdWord; +absl::flat_hash_set TokenizeWords(std::string_view text) { absl::flat_hash_set words; - - int32_t start = wordIter->first(); - for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE; - start = end, end = wordIter->next()) { - icu::UnicodeString word = uStr.tempSubStringBetween(start, end); - // If the substring is not a space, convert it to lowercase and add to results - if (!word.isBogus() && !word.trim().isEmpty()) { - word.toLower(); - word.toUTF8String(tmpStdWord); - words.emplace(move(tmpStdWord)); - } - } - + for (std::string_view word : una::views::word_only::utf8(text)) + words.insert(una::cases::to_lowercase_utf8(word)); return words; } -// Convert string to lowercase with ICU library -std::string ICUToLowercase(string_view input) { - icu::UnicodeString uStr = - icu::UnicodeString::fromUTF8(icu::StringPiece(input.data(), input.size())); - uStr.toLower(); - std::string result; - uStr.toUTF8String(result); - return result; -} - // Split taglist, remove duplicates and convert all to lowercase absl::flat_hash_set NormalizeTags(string_view taglist) { string tmp; @@ -127,7 +82,7 @@ const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const { if (IsAllAscii(str)) word = absl::AsciiStrToLower(str); else - word = ICUToLowercase(str); + word = una::cases::to_lowercase_utf8(str); auto it = entries_.find(word); return (it != entries_.end()) ? &it->second : nullptr; @@ -144,7 +99,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field) } absl::flat_hash_set TextIndex::Tokenize(std::string_view value) const { - return ICUTokenizeWords(value); + return TokenizeWords(value); } absl::flat_hash_set TagIndex::Tokenize(std::string_view value) const { diff --git a/src/server/search/search_family_test.cc b/src/server/search/search_family_test.cc index c9dd9835d..c8d52a537 100644 --- a/src/server/search/search_family_test.cc +++ b/src/server/search/search_family_test.cc @@ -361,6 +361,19 @@ TEST_F(SearchFamilyTest, Unicode) { UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας")); } +TEST_F(SearchFamilyTest, UnicodeWords) { + EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text"}), "OK"); + + Run({"hset", "d:1", "title", + "WORD!!! Одно слово? Zwei Wörter. Comma before ,sentence, " + "Τρεις λέξεις: χελώνα-σκύλου-γάτας. !זה עובד", + "visits", "400"}); + + // Make sure it includes ALL those words + EXPECT_THAT(Run({"ft.search", "i1", "word слово wörter sentence λέξεις γάτας עובד"}), + AreDocIds("d:1")); +} + TEST_F(SearchFamilyTest, SimpleExpiry) { EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK"); diff --git a/tools/packaging/Dockerfile.alpine-dev b/tools/packaging/Dockerfile.alpine-dev index 1e71d6fea..4ac3db97f 100644 --- a/tools/packaging/Dockerfile.alpine-dev +++ b/tools/packaging/Dockerfile.alpine-dev @@ -6,7 +6,7 @@ FROM alpine:3 as builder # "openssl-libs-static" fixes "Could NOT find OpenSSL, try to set the path to OpenSSL root folder in the" RUN apk add autoconf-archive automake bash bison boost-dev cmake coreutils \ curl ccache git gcc gdb g++ libunwind-dev libtool libxml2-dev make ninja \ - openssl-dev openssl-libs-static patch zip zstd-static icu-dev + openssl-dev openssl-libs-static patch zip zstd-static # This is required to make static linking work RUN ls -1 /usr/lib/libboost_*.so | while read -r _file; do ln -sfv ${_file} ${_file//.so/.a}; done diff --git a/tools/packaging/Dockerfile.ubuntu-dev b/tools/packaging/Dockerfile.ubuntu-dev index c35e38ef9..8ff5c7ab4 100644 --- a/tools/packaging/Dockerfile.ubuntu-dev +++ b/tools/packaging/Dockerfile.ubuntu-dev @@ -12,7 +12,7 @@ RUN \ apt update && \ apt install -q -y autoconf-archive cmake curl git libssl-dev \ libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \ - libxml2-dev zip libzstd-dev bison libicu-dev + libxml2-dev zip libzstd-dev bison RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \ --slave /usr/bin/g++ g++ /usr/bin/g++-9