diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index c2e18c20f..296939ed1 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -60,7 +60,7 @@ jobs:
           export DEBIAN_FRONTEND=noninteractive
           apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \
               libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
-              libxml2-dev zip libzstd-dev debhelper moreutils bison
+              libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev
           update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
               --slave /usr/bin/g++ g++ /usr/bin/g++-9
         run: |
diff --git a/docs/build-from-source.md b/docs/build-from-source.md
index 6ffcad509..8ffd6eeeb 100644
--- a/docs/build-from-source.md
+++ b/docs/build-from-source.md
@@ -16,21 +16,21 @@ On Debian/Ubuntu:
 
 ```bash
 sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
-     autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev
+     autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev
 ```
 
 On Fedora:
 
 ```bash
 sudo yum install automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
-    openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel
+    openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel
 ```
 
 On openSUSE:
 
 ```bash
 sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
-    openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
+    openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \
     libboost_context-devel libboost_system-devel
 ```
 
diff --git a/src/core/search/CMakeLists.txt b/src/core/search/CMakeLists.txt
index 78901e8e1..a68c73bbe 100644
--- a/src/core/search/CMakeLists.txt
+++ b/src/core/search/CMakeLists.txt
@@ -3,8 +3,13 @@ gen_bison(parser)
 
 cur_gen_dir(gen_dir)
 
+find_package(ICU REQUIRED COMPONENTS uc i18n)
+
 add_library(query_parser ast_expr.cc query_driver.cc search.cc indices.cc vector.cc
             compressed_sorted_set.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
+
+target_link_libraries(query_parser ICU::uc ICU::i18n)
+
 target_link_libraries(query_parser base absl::strings TRDP::reflex)
 
 cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
 cxx_test(search_parser_test query_parser LABELS DFLY)
diff --git a/src/core/search/indices.cc b/src/core/search/indices.cc
index d94dd138e..39f98a363 100644
--- a/src/core/search/indices.cc
+++ b/src/core/search/indices.cc
@@ -8,8 +8,11 @@
 #include
 #include
 #include
+#include <unicode/brkiter.h>
+#include <unicode/unistr.h>
 
 #include
+#include <memory>
 #include
 
 #include "base/logging.h"
@@ -20,20 +23,64 @@ using namespace std;
 
 namespace {
 
-// Get all words from text as matched by regex word boundaries
-absl::flat_hash_set<string> TokenizeWords(string_view text) {
-  std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
-  std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
+bool IsAllAscii(string_view sv) {
+  return all_of(sv.begin(), sv.end(), [](unsigned char c) { return isascii(c); });
+}
 
-  absl::flat_hash_set<string> words;
-  for (auto it = begin; it != end; ++it) {
-    auto word = it->str();
-    absl::AsciiStrToLower(&word);
-    words.insert(move(word));
+// Get all words from text as matched by ICU word boundaries
+absl::flat_hash_set<std::string> ICUTokenizeWords(std::string_view text) {
+  // If the text contains only ASCII, skip working with ICU resources
+  if (IsAllAscii(text)) {
+    std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
+    std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
+
+    absl::flat_hash_set<std::string> words;
+    for (auto it = begin; it != end; ++it) {
+      auto word = it->str();
+      absl::AsciiStrToLower(&word);
+      words.insert(move(word));
+    }
+    return words;
   }
+
+  icu::UnicodeString uStr(text.data(), text.size(), "UTF-8");
+
+  UErrorCode status = U_ZERO_ERROR;
+  std::unique_ptr<icu::BreakIterator> wordIter{
+      icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};
+
+  if (U_FAILURE(status))
+    return {};
+
+  wordIter->setText(uStr);
+
+  std::string tmpStdWord;
+  absl::flat_hash_set<std::string> words;
+
+  int32_t start = wordIter->first();
+  for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE;
+       start = end, end = wordIter->next()) {
+    icu::UnicodeString word = uStr.tempSubStringBetween(start, end);
+    // If the substring is not a space, convert it to lowercase and add to results
+    if (!word.isBogus() && !word.trim().isEmpty()) {
+      word.toLower();
+      word.toUTF8String(tmpStdWord);
+      words.emplace(move(tmpStdWord));
+    }
+  }
   return words;
 }
 
+// Convert string to lowercase with the ICU library
+std::string ICUToLowercase(string_view input) {
+  icu::UnicodeString uStr = icu::UnicodeString::fromUTF8(input);
+  uStr.toLower();
+  std::string result;
+  uStr.toUTF8String(result);
+  return result;
+}
+
 // Split taglist, remove duplicates and convert all to lowercase
 absl::flat_hash_set<string> NormalizeTags(string_view taglist) {
   string tmp;
@@ -73,7 +120,15 @@ vector<DocId> NumericIndex::Range(int64_t l, int64_t r) const {
 }
 
 const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
-  auto it = entries_.find(absl::StripAsciiWhitespace(str));
+  str = absl::StripAsciiWhitespace(str);
+
+  string word;
+  if (IsAllAscii(str))
+    word = absl::AsciiStrToLower(str);
+  else
+    word = ICUToLowercase(str);
+
+  auto it = entries_.find(word);
   return (it != entries_.end()) ? &it->second : nullptr;
 }
 
@@ -88,7 +143,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
 }
 
 absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
-  return TokenizeWords(value);
+  return ICUTokenizeWords(value);
 }
 
 absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
diff --git a/src/server/search/search_family_test.cc b/src/server/search/search_family_test.cc
index ba7fbc8d2..f9ee3bd75 100644
--- a/src/server/search/search_family_test.cc
+++ b/src/server/search/search_family_test.cc
@@ -287,6 +287,25 @@ TEST_F(SearchFamilyTest, SimpleUpdates) {
   }
 }
 
+TEST_F(SearchFamilyTest, Unicode) {
+  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");
+
+  // Explicitly use screaming uppercase to check the UTF-8 to lowercase conversion
+  Run({"hset", "d:1", "title", "Веселая СТРЕКОЗА Иван", "visits", "400"});
+  Run({"hset", "d:2", "title", "Die fröhliche Libelle Günther", "visits", "300"});
+  Run({"hset", "d:3", "title", "השפירית המהירה יעקב", "visits", "200"});
+  Run({"hset", "d:4", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας", "visits", "100"});
+
+  // Check we find our dragonfly in all languages
+  EXPECT_THAT(Run({"ft.search", "i1", "стРекоЗа|liBellE|השפירית|λΙβελλοΎλη"}),
+              AreDocIds("d:1", "d:2", "d:3", "d:4"));
+
+  // Check the result is valid
+  auto resp = Run({"ft.search", "i1", "λιβελλούλη"});
+  EXPECT_THAT(resp.GetVec()[2].GetVec(),
+              UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας"));
+}
+
 TEST_F(SearchFamilyTest, SimpleExpiry) {
   EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");
 
diff --git a/tools/docker/Dockerfile.ubuntu-dev b/tools/docker/Dockerfile.ubuntu-dev
index 750d29e24..eec0d809a 100644
--- a/tools/docker/Dockerfile.ubuntu-dev
+++ b/tools/docker/Dockerfile.ubuntu-dev
@@ -12,7 +12,7 @@ RUN \
     apt update && \
     apt install -q -y autoconf-archive cmake curl git libssl-dev \
         libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
-        libxml2-dev zip libzstd-dev bison
+        libxml2-dev zip libzstd-dev bison libicu-dev
 
 RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
     --slave /usr/bin/g++ g++ /usr/bin/g++-9
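
Note on the ICU usage in the patch: ICUTokenizeWords() boils down to creating a word BreakIterator for the default locale, walking the boundary positions, and keeping only the lowercased, non-whitespace tokens. The standalone sketch below shows the same ICU calls in isolation; it is not part of the patch, and the file name icu_tokenize_demo.cc and helper name TokenizeLowercase are illustrative only. It assumes ICU development headers are installed (the libicu-dev / libicu-devel package added above) and that pkg-config can locate the icu-uc and icu-i18n modules, matching the components the CMake change links.

// icu_tokenize_demo.cc -- standalone sketch, not part of the patch.
// Build (assuming pkg-config can find ICU):
//   g++ -std=c++17 icu_tokenize_demo.cc $(pkg-config --cflags --libs icu-uc icu-i18n)

#include <unicode/brkiter.h>
#include <unicode/stringpiece.h>
#include <unicode/unistr.h>

#include <iostream>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

// Split UTF-8 text on ICU word boundaries and return the lowercased tokens,
// mirroring the non-ASCII path of ICUTokenizeWords() in indices.cc.
std::vector<std::string> TokenizeLowercase(std::string_view text) {
  icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(
      icu::StringPiece(text.data(), static_cast<int32_t>(text.size())));

  UErrorCode status = U_ZERO_ERROR;
  std::unique_ptr<icu::BreakIterator> iter{
      icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};
  if (U_FAILURE(status))
    return {};

  iter->setText(ustr);

  std::vector<std::string> words;
  int32_t start = iter->first();
  for (int32_t end = iter->next(); end != icu::BreakIterator::DONE;
       start = end, end = iter->next()) {
    icu::UnicodeString word = ustr.tempSubStringBetween(start, end);
    // Boundaries also delimit runs of whitespace; trim() makes those empty, so skip them.
    if (!word.isBogus() && !word.trim().isEmpty()) {
      word.toLower();
      std::string utf8;
      word.toUTF8String(utf8);
      words.push_back(std::move(utf8));
    }
  }
  return words;
}

int main() {
  // Mixed-script input: German and Russian words with varied casing.
  for (const std::string& w : TokenizeLowercase("Die fröhliche Libelle Веселая СТРЕКОЗА"))
    std::cout << w << '\n';  // die / fröhliche / libelle / веселая / стрекоза
}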