Remove ICU library (#1812)

* chore(search): Replace icu with unialgo

---------

Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
This commit is contained in:
Vladislav 2023-09-06 15:06:38 +03:00 committed by GitHub
parent a8e4bebffe
commit e0af5fe836
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 37 additions and 65 deletions

View File

@ -60,7 +60,7 @@ jobs:
export DEBIAN_FRONTEND=noninteractive
apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev
libxml2-dev zip libzstd-dev debhelper moreutils bison
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
--slave /usr/bin/g++ g++ /usr/bin/g++-9
run: |

View File

@ -16,21 +16,21 @@ On Debian/Ubuntu:
```bash
sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev
```
On Fedora:
```bash
sudo dnf install -y automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel
```
On openSUSE:
```bash
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
libboost_context-devel libboost_system-devel
```

View File

@ -92,6 +92,13 @@ add_third_party(
LIB "none"
)
add_third_party(
uni-algo
URL https://github.com/uni-algo/uni-algo/archive/refs/tags/v1.0.0.tar.gz
BUILD_IN_SOURCE 1
)
add_library(TRDP::jsoncons INTERFACE IMPORTED)
add_dependencies(TRDP::jsoncons jsoncons_project)
set_target_properties(TRDP::jsoncons PROPERTIES

View File

@ -3,14 +3,11 @@ gen_bison(parser)
cur_gen_dir(gen_dir)
find_package(ICU REQUIRED COMPONENTS uc i18n)
add_library(query_parser ast_expr.cc query_driver.cc search.cc indices.cc vector_utils.cc compressed_sorted_set.cc
${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
target_link_libraries(query_parser ICU::uc ICU::i18n)
target_link_libraries(query_parser base absl::strings TRDP::reflex TRDP::uni-algo)
target_link_libraries(query_parser base absl::strings TRDP::reflex)
cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
cxx_test(search_parser_test query_parser LABELS DFLY)
cxx_test(search_test query_parser LABELS DFLY)

View File

@ -8,12 +8,14 @@
#include <absl/strings/ascii.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#define UNI_ALGO_DISABLE_NFKC_NFKD
#include <uni_algo/case.h>
#include <uni_algo/ranges_word.h>
#include <algorithm>
#include <cctype>
#include <regex>
#include "base/logging.h"
@ -28,60 +30,13 @@ bool IsAllAscii(string_view sv) {
}
// Get all words from text as matched by the ICU library
absl::flat_hash_set<std::string> ICUTokenizeWords(std::string_view text) {
// If text contains only ASCII, skip working with ICU resources
if (IsAllAscii(text)) {
std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
absl::flat_hash_set<string> words;
for (auto it = begin; it != end; ++it) {
auto word = it->str();
absl::AsciiStrToLower(&word);
words.insert(move(word));
}
return words;
}
icu::UnicodeString uStr(text.data(), text.size(), "UTF-8");
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<icu::BreakIterator> wordIter{
icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};
if (U_FAILURE(status))
return {};
wordIter->setText(uStr);
std::string tmpStdWord;
absl::flat_hash_set<std::string> TokenizeWords(std::string_view text) {
absl::flat_hash_set<std::string> words;
int32_t start = wordIter->first();
for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE;
start = end, end = wordIter->next()) {
icu::UnicodeString word = uStr.tempSubStringBetween(start, end);
// If the substring is not a space, convert it to lowercase and add to results
if (!word.isBogus() && !word.trim().isEmpty()) {
word.toLower();
word.toUTF8String(tmpStdWord);
words.emplace(move(tmpStdWord));
}
}
for (std::string_view word : una::views::word_only::utf8(text))
words.insert(una::cases::to_lowercase_utf8(word));
return words;
}
// Convert string to lowercase with ICU library.
// The input bytes are handed to ICU as UTF-8, lowercased on the ICU string,
// and then written back out as a UTF-8 std::string.
std::string ICUToLowercase(string_view input) {
// Build an ICU string from the raw (pointer, length) view of the input.
icu::UnicodeString uStr =
icu::UnicodeString::fromUTF8(icu::StringPiece(input.data(), input.size()));
// Lowercase the ICU string itself rather than a copy.
uStr.toLower();
std::string result;
// Re-encode the lowercased text as UTF-8 into result.
uStr.toUTF8String(result);
return result;
}
// Split taglist, remove duplicates and convert all to lowercase
absl::flat_hash_set<string> NormalizeTags(string_view taglist) {
string tmp;
@ -127,7 +82,7 @@ const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
if (IsAllAscii(str))
word = absl::AsciiStrToLower(str);
else
word = ICUToLowercase(str);
word = una::cases::to_lowercase_utf8(str);
auto it = entries_.find(word);
return (it != entries_.end()) ? &it->second : nullptr;
@ -144,7 +99,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
}
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
return ICUTokenizeWords(value);
return TokenizeWords(value);
}
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {

View File

@ -361,6 +361,19 @@ TEST_F(SearchFamilyTest, Unicode) {
UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας"));
}
// Checks word tokenization of a text field that mixes several scripts
// (Latin, Cyrillic, Greek, Hebrew) with punctuation attached to the words.
TEST_F(SearchFamilyTest, UnicodeWords) {
// Index "i1" with a single text field named "title".
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text"}), "OK");
// The stored title has words glued to punctuation ("WORD!!!", ",sentence,"),
// hyphen-joined words and mixed-case words.
Run({"hset", "d:1", "title",
"WORD!!! Одно слово? Zwei Wörter. Comma before ,sentence, "
"Τρεις λέξεις: χελώνα-σκύλου-γάτας. !זה עובד",
"visits", "400"});
// Make sure it includes ALL those words
// The query terms are written in lowercase and without the surrounding punctuation.
EXPECT_THAT(Run({"ft.search", "i1", "word слово wörter sentence λέξεις γάτας עובד"}),
AreDocIds("d:1"));
}
TEST_F(SearchFamilyTest, SimpleExpiry) {
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");

View File

@ -6,7 +6,7 @@ FROM alpine:3 as builder
# "openssl-libs-static" fixes "Could NOT find OpenSSL, try to set the path to OpenSSL root folder in the"
RUN apk add autoconf-archive automake bash bison boost-dev cmake coreutils \
curl ccache git gcc gdb g++ libunwind-dev libtool libxml2-dev make ninja \
openssl-dev openssl-libs-static patch zip zstd-static icu-dev
openssl-dev openssl-libs-static patch zip zstd-static
# This is required to make static linking work
RUN ls -1 /usr/lib/libboost_*.so | while read -r _file; do ln -sfv ${_file} ${_file//.so/.a}; done

View File

@ -12,7 +12,7 @@ RUN \
apt update && \
apt install -q -y autoconf-archive cmake curl git libssl-dev \
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
libxml2-dev zip libzstd-dev bison libicu-dev
libxml2-dev zip libzstd-dev bison
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
--slave /usr/bin/g++ g++ /usr/bin/g++-9