mirror of
https://github.com/dragonflydb/dragonfly
synced 2024-11-21 23:19:53 +00:00
feat: Support unicode strings in search (#1698)
Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io> Signed-off-by: Vladislav <vlad@dragonflydb.io>
This commit is contained in:
parent
003d2031b5
commit
5198622a15
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@ -60,7 +60,7 @@ jobs:
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \
|
||||
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
|
||||
libxml2-dev zip libzstd-dev debhelper moreutils bison
|
||||
libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
|
||||
--slave /usr/bin/g++ g++ /usr/bin/g++-9
|
||||
run: |
|
||||
|
@ -16,21 +16,21 @@ On Debian/Ubuntu:
|
||||
|
||||
```bash
|
||||
sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
|
||||
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev
|
||||
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev
|
||||
```
|
||||
|
||||
On Fedora:
|
||||
|
||||
```bash
|
||||
sudo yum install automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel
|
||||
```
|
||||
|
||||
On openSUSE:
|
||||
|
||||
```bash
|
||||
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
|
||||
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \
|
||||
libboost_context-devel libboost_system-devel
|
||||
```
|
||||
|
||||
|
@ -3,8 +3,13 @@ gen_bison(parser)
|
||||
|
||||
cur_gen_dir(gen_dir)
|
||||
|
||||
find_package(ICU REQUIRED COMPONENTS uc i18n)
|
||||
|
||||
add_library(query_parser ast_expr.cc query_driver.cc search.cc indices.cc vector.cc compressed_sorted_set.cc
|
||||
${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||
|
||||
target_link_libraries(query_parser ICU::uc ICU::i18n)
|
||||
|
||||
target_link_libraries(query_parser base absl::strings TRDP::reflex)
|
||||
cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
|
||||
cxx_test(search_parser_test query_parser LABELS DFLY)
|
||||
|
@ -8,8 +8,11 @@
|
||||
#include <absl/strings/ascii.h>
|
||||
#include <absl/strings/numbers.h>
|
||||
#include <absl/strings/str_split.h>
|
||||
#include <unicode/brkiter.h>
|
||||
#include <unicode/unistr.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <regex>
|
||||
|
||||
#include "base/logging.h"
|
||||
@ -20,20 +23,64 @@ using namespace std;
|
||||
|
||||
namespace {
|
||||
|
||||
// Get all words from text as matched by regex word boundaries
|
||||
absl::flat_hash_set<string> TokenizeWords(string_view text) {
|
||||
std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
|
||||
std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
|
||||
// Returns true if every byte of sv lies in the 7-bit ASCII range (0x00-0x7F).
// An empty view is considered all-ASCII.
bool IsAllAscii(std::string_view sv) {
  // Compare the byte value directly: isascii() is a POSIX extension that
  // ISO C++ <cctype> does not guarantee to declare.
  return std::all_of(sv.begin(), sv.end(), [](unsigned char c) { return c < 0x80; });
}
|
||||
|
||||
absl::flat_hash_set<string> words;
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
auto word = it->str();
|
||||
absl::AsciiStrToLower(&word);
|
||||
words.insert(move(word));
|
||||
// Get all words from text as matched by the ICU library
|
||||
absl::flat_hash_set<std::string> ICUTokenizeWords(std::string_view text) {
|
||||
// Is text contains only ascii, skip working with ICU resources
|
||||
if (IsAllAscii(text)) {
|
||||
std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
|
||||
std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
|
||||
|
||||
absl::flat_hash_set<string> words;
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
auto word = it->str();
|
||||
absl::AsciiStrToLower(&word);
|
||||
words.insert(move(word));
|
||||
}
|
||||
return words;
|
||||
}
|
||||
|
||||
icu::UnicodeString uStr(text.data(), text.size(), "UTF-8");
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
std::unique_ptr<icu::BreakIterator> wordIter{
|
||||
icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};
|
||||
|
||||
if (U_FAILURE(status))
|
||||
return {};
|
||||
|
||||
wordIter->setText(uStr);
|
||||
|
||||
std::string tmpStdWord;
|
||||
absl::flat_hash_set<std::string> words;
|
||||
|
||||
int32_t start = wordIter->first();
|
||||
for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE;
|
||||
start = end, end = wordIter->next()) {
|
||||
icu::UnicodeString word = uStr.tempSubStringBetween(start, end);
|
||||
// If the substring is not a space, convert it to lowercase and add to results
|
||||
if (!word.isBogus() && !word.trim().isEmpty()) {
|
||||
word.toLower();
|
||||
word.toUTF8String(tmpStdWord);
|
||||
words.emplace(move(tmpStdWord));
|
||||
}
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
// Convert string to lowercase with ICU library
|
||||
std::string ICUToLowercase(string_view input) {
|
||||
icu::UnicodeString uStr = icu::UnicodeString::fromUTF8(input);
|
||||
uStr.toLower();
|
||||
std::string result;
|
||||
uStr.toUTF8String(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Split taglist, remove duplicates and convert all to lowercase
|
||||
absl::flat_hash_set<string> NormalizeTags(string_view taglist) {
|
||||
string tmp;
|
||||
@ -73,7 +120,15 @@ vector<DocId> NumericIndex::Range(int64_t l, int64_t r) const {
|
||||
}
|
||||
|
||||
// Find the entry stored for the given term.
// The term is stripped of surrounding ASCII whitespace and lowercased (with
// absl for pure ASCII input, with ICU otherwise) before the lookup.
// Returns a pointer to the matching set, or nullptr if there is no entry.
const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
  str = absl::StripAsciiWhitespace(str);

  string word;
  if (IsAllAscii(str))
    word = absl::AsciiStrToLower(str);
  else
    word = ICUToLowercase(str);

  // Single lookup on the normalized term; looking up the raw term would
  // bypass the case folding above.
  auto it = entries_.find(word);
  return (it != entries_.end()) ? &it->second : nullptr;
}
|
||||
|
||||
@ -88,7 +143,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
|
||||
}
|
||||
|
||||
// Split a text field value into its set of distinct lowercase words.
// Delegates to ICUTokenizeWords, which handles both ASCII and non-ASCII input.
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
  return ICUTokenizeWords(value);
}
|
||||
|
||||
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
|
||||
|
@ -287,6 +287,25 @@ TEST_F(SearchFamilyTest, SimpleUpdates) {
|
||||
}
|
||||
}
|
||||
|
||||
// Text search over non-ASCII (UTF-8) titles: queries must match regardless of
// letter case in Cyrillic, German, Hebrew and Greek.
TEST_F(SearchFamilyTest, Unicode) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");

  // Titles deliberately mix in fully uppercase words so that the UTF-8
  // lowercase conversion is exercised
  Run({"hset", "d:1", "title", "Веселая СТРЕКОЗА Иван", "visits", "400"});
  Run({"hset", "d:2", "title", "Die fröhliche Libelle Günther", "visits", "300"});
  Run({"hset", "d:3", "title", "השפירית המהירה יעקב", "visits", "200"});
  Run({"hset", "d:4", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας", "visits", "100"});

  // A mixed-case query for "dragonfly" in every language must hit all four documents
  auto all_langs = Run({"ft.search", "i1", "стРекоЗа|liBellE|השפירית|λΙβελλοΎλη"});
  EXPECT_THAT(all_langs, AreDocIds("d:1", "d:2", "d:3", "d:4"));

  // The returned document must carry its original, unmodified field values
  auto greek = Run({"ft.search", "i1", "λιβελλούλη"});
  EXPECT_THAT(greek.GetVec()[2].GetVec(),
              UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας"));
}
|
||||
|
||||
TEST_F(SearchFamilyTest, SimpleExpiry) {
|
||||
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");
|
||||
|
||||
|
@ -12,7 +12,7 @@ RUN \
|
||||
apt update && \
|
||||
apt install -q -y autoconf-archive cmake curl git libssl-dev \
|
||||
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
|
||||
libxml2-dev zip libzstd-dev bison
|
||||
libxml2-dev zip libzstd-dev bison libicu-dev
|
||||
|
||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
|
||||
--slave /usr/bin/g++ g++ /usr/bin/g++-9
|
||||
|
Loading…
Reference in New Issue
Block a user