mirror of
https://github.com/dragonflydb/dragonfly
synced 2024-11-21 23:19:53 +00:00
Remove ICU library (#1812)
* chore(search): Replace icu with unialgo --------- Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
This commit is contained in:
parent
a8e4bebffe
commit
e0af5fe836
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@ -60,7 +60,7 @@ jobs:
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \
|
||||
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
|
||||
libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev
|
||||
libxml2-dev zip libzstd-dev debhelper moreutils bison
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
|
||||
--slave /usr/bin/g++ g++ /usr/bin/g++-9
|
||||
run: |
|
||||
|
@ -16,21 +16,21 @@ On Debian/Ubuntu:
|
||||
|
||||
```bash
|
||||
sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
|
||||
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev
|
||||
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev
|
||||
```
|
||||
|
||||
On Fedora:
|
||||
|
||||
```bash
|
||||
sudo dnf install -y automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel
|
||||
```
|
||||
|
||||
On openSUSE:
|
||||
|
||||
```bash
|
||||
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
|
||||
libboost_context-devel libboost_system-devel
|
||||
```
|
||||
|
||||
|
@ -92,6 +92,13 @@ add_third_party(
|
||||
LIB "none"
|
||||
)
|
||||
|
||||
add_third_party(
|
||||
uni-algo
|
||||
URL https://github.com/uni-algo/uni-algo/archive/refs/tags/v1.0.0.tar.gz
|
||||
|
||||
BUILD_IN_SOURCE 1
|
||||
)
|
||||
|
||||
add_library(TRDP::jsoncons INTERFACE IMPORTED)
|
||||
add_dependencies(TRDP::jsoncons jsoncons_project)
|
||||
set_target_properties(TRDP::jsoncons PROPERTIES
|
||||
|
@ -3,14 +3,11 @@ gen_bison(parser)
|
||||
|
||||
cur_gen_dir(gen_dir)
|
||||
|
||||
find_package(ICU REQUIRED COMPONENTS uc i18n)
|
||||
|
||||
add_library(query_parser ast_expr.cc query_driver.cc search.cc indices.cc vector_utils.cc compressed_sorted_set.cc
|
||||
${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||
|
||||
target_link_libraries(query_parser ICU::uc ICU::i18n)
|
||||
target_link_libraries(query_parser base absl::strings TRDP::reflex TRDP::uni-algo)
|
||||
|
||||
target_link_libraries(query_parser base absl::strings TRDP::reflex)
|
||||
cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
|
||||
cxx_test(search_parser_test query_parser LABELS DFLY)
|
||||
cxx_test(search_test query_parser LABELS DFLY)
|
||||
|
@ -8,12 +8,14 @@
|
||||
#include <absl/strings/ascii.h>
|
||||
#include <absl/strings/numbers.h>
|
||||
#include <absl/strings/str_split.h>
|
||||
#include <unicode/brkiter.h>
|
||||
#include <unicode/unistr.h>
|
||||
|
||||
#define UNI_ALGO_DISABLE_NFKC_NFKD
|
||||
|
||||
#include <uni_algo/case.h>
|
||||
#include <uni_algo/ranges_word.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <regex>
|
||||
|
||||
#include "base/logging.h"
|
||||
|
||||
@ -28,60 +30,13 @@ bool IsAllAscii(string_view sv) {
|
||||
}
|
||||
|
||||
// Get all words from text, lowercased, as split on Unicode word boundaries
|
||||
absl::flat_hash_set<std::string> ICUTokenizeWords(std::string_view text) {
|
||||
// If text contains only ASCII, skip working with ICU resources
|
||||
if (IsAllAscii(text)) {
|
||||
std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
|
||||
std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
|
||||
|
||||
absl::flat_hash_set<string> words;
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
auto word = it->str();
|
||||
absl::AsciiStrToLower(&word);
|
||||
words.insert(move(word));
|
||||
}
|
||||
return words;
|
||||
}
|
||||
|
||||
icu::UnicodeString uStr(text.data(), text.size(), "UTF-8");
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
std::unique_ptr<icu::BreakIterator> wordIter{
|
||||
icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};
|
||||
|
||||
if (U_FAILURE(status))
|
||||
return {};
|
||||
|
||||
wordIter->setText(uStr);
|
||||
|
||||
std::string tmpStdWord;
|
||||
absl::flat_hash_set<std::string> TokenizeWords(std::string_view text) {
|
||||
absl::flat_hash_set<std::string> words;
|
||||
|
||||
int32_t start = wordIter->first();
|
||||
for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE;
|
||||
start = end, end = wordIter->next()) {
|
||||
icu::UnicodeString word = uStr.tempSubStringBetween(start, end);
|
||||
// If the substring is not a space, convert it to lowercase and add to results
|
||||
if (!word.isBogus() && !word.trim().isEmpty()) {
|
||||
word.toLower();
|
||||
word.toUTF8String(tmpStdWord);
|
||||
words.emplace(move(tmpStdWord));
|
||||
}
|
||||
}
|
||||
|
||||
for (std::string_view word : una::views::word_only::utf8(text))
|
||||
words.insert(una::cases::to_lowercase_utf8(word));
|
||||
return words;
|
||||
}
|
||||
|
||||
// Convert string to lowercase with ICU library
|
||||
std::string ICUToLowercase(string_view input) {
|
||||
icu::UnicodeString uStr =
|
||||
icu::UnicodeString::fromUTF8(icu::StringPiece(input.data(), input.size()));
|
||||
uStr.toLower();
|
||||
std::string result;
|
||||
uStr.toUTF8String(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Split taglist, remove duplicates and convert all to lowercase
|
||||
absl::flat_hash_set<string> NormalizeTags(string_view taglist) {
|
||||
string tmp;
|
||||
@ -127,7 +82,7 @@ const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
|
||||
if (IsAllAscii(str))
|
||||
word = absl::AsciiStrToLower(str);
|
||||
else
|
||||
word = ICUToLowercase(str);
|
||||
word = una::cases::to_lowercase_utf8(str);
|
||||
|
||||
auto it = entries_.find(word);
|
||||
return (it != entries_.end()) ? &it->second : nullptr;
|
||||
@ -144,7 +99,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
|
||||
}
|
||||
|
||||
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
|
||||
return ICUTokenizeWords(value);
|
||||
return TokenizeWords(value);
|
||||
}
|
||||
|
||||
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
|
||||
|
@ -361,6 +361,19 @@ TEST_F(SearchFamilyTest, Unicode) {
|
||||
UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας"));
|
||||
}
|
||||
|
||||
TEST_F(SearchFamilyTest, UnicodeWords) {
|
||||
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text"}), "OK");
|
||||
|
||||
Run({"hset", "d:1", "title",
|
||||
"WORD!!! Одно слово? Zwei Wörter. Comma before ,sentence, "
|
||||
"Τρεις λέξεις: χελώνα-σκύλου-γάτας. !זה עובד",
|
||||
"visits", "400"});
|
||||
|
||||
// Make sure it includes ALL those words
|
||||
EXPECT_THAT(Run({"ft.search", "i1", "word слово wörter sentence λέξεις γάτας עובד"}),
|
||||
AreDocIds("d:1"));
|
||||
}
|
||||
|
||||
TEST_F(SearchFamilyTest, SimpleExpiry) {
|
||||
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");
|
||||
|
||||
|
@ -6,7 +6,7 @@ FROM alpine:3 as builder
|
||||
# "openssl-libs-static" fixes "Could NOT find OpenSSL, try to set the path to OpenSSL root folder in the"
|
||||
RUN apk add autoconf-archive automake bash bison boost-dev cmake coreutils \
|
||||
curl ccache git gcc gdb g++ libunwind-dev libtool libxml2-dev make ninja \
|
||||
openssl-dev openssl-libs-static patch zip zstd-static icu-dev
|
||||
openssl-dev openssl-libs-static patch zip zstd-static
|
||||
|
||||
# This is required to make static linking work
|
||||
RUN ls -1 /usr/lib/libboost_*.so | while read -r _file; do ln -sfv ${_file} ${_file//.so/.a}; done
|
||||
|
@ -12,7 +12,7 @@ RUN \
|
||||
apt update && \
|
||||
apt install -q -y autoconf-archive cmake curl git libssl-dev \
|
||||
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
|
||||
libxml2-dev zip libzstd-dev bison libicu-dev
|
||||
libxml2-dev zip libzstd-dev bison
|
||||
|
||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
|
||||
--slave /usr/bin/g++ g++ /usr/bin/g++-9
|
||||
|
Loading…
Reference in New Issue
Block a user