feat: Support unicode strings in search (#1698)

Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
Signed-off-by: Vladislav <vlad@dragonflydb.io>
This commit is contained in:
Vladislav 2023-08-18 15:40:37 +03:00 committed by GitHub
parent 003d2031b5
commit 5198622a15
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 95 additions and 16 deletions

View File

@ -60,7 +60,7 @@ jobs:
export DEBIAN_FRONTEND=noninteractive
apt update && apt install -q -y autoconf-archive cmake curl git libssl-dev \
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
libxml2-dev zip libzstd-dev debhelper moreutils bison
libxml2-dev zip libzstd-dev debhelper moreutils bison libicu-dev
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
--slave /usr/bin/g++ g++ /usr/bin/g++-9
run: |

View File

@ -16,21 +16,21 @@ On Debian/Ubuntu:
```bash
sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev libicu-dev
```
On Fedora:
```bash
sudo yum install automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel
```
On openSUSE:
```bash
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libicu-devel \
libboost_context-devel libboost_system-devel
```

View File

@ -3,8 +3,13 @@ gen_bison(parser)
# Resolve the directory that holds the generated parser/lexer sources
cur_gen_dir(gen_dir)

# ICU is required by the search code for Unicode-aware text handling
find_package(ICU REQUIRED COMPONENTS uc i18n)

add_library(query_parser
  ast_expr.cc
  query_driver.cc
  search.cc
  indices.cc
  vector.cc
  compressed_sorted_set.cc
  ${gen_dir}/parser.cc
  ${gen_dir}/lexer.cc)

# ICU targets first, then the remaining dependencies (same order as before)
target_link_libraries(query_parser
  ICU::uc ICU::i18n
  base absl::strings TRDP::reflex)

cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
cxx_test(search_parser_test query_parser LABELS DFLY)

View File

@ -8,8 +8,11 @@
#include <absl/strings/ascii.h>
#include <absl/strings/numbers.h>
#include <absl/strings/str_split.h>
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#include <algorithm>
#include <cctype>
#include <regex>
#include "base/logging.h"
@ -20,20 +23,64 @@ using namespace std;
namespace {
// Get all words from text as matched by regex word boundaries
absl::flat_hash_set<string> TokenizeWords(string_view text) {
std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};
// Returns true when every byte of `sv` is a 7-bit ASCII character (value below 0x80).
// An empty view is vacuously all-ASCII.
bool IsAllAscii(std::string_view sv) {
  // Test the byte value directly: isascii() is a POSIX extension that <cctype> does not
  // guarantee, whereas this comparison is portable and has the same result.
  return std::all_of(sv.begin(), sv.end(), [](unsigned char c) { return c < 0x80; });
}
absl::flat_hash_set<string> words;
for (auto it = begin; it != end; ++it) {
auto word = it->str();
absl::AsciiStrToLower(&word);
words.insert(move(word));
// Split `text` into a set of unique, lower-cased word tokens.
//
// Pure-ASCII input takes a fast path that avoids ICU: tokens are the spans between regex word
// boundaries, lower-cased with absl. Any other input is decoded as UTF-8 and segmented with an
// ICU word BreakIterator created for the default locale, then lower-cased with ICU.
// In both paths segments that are empty or consist only of whitespace are dropped.
// Returns an empty set if the ICU break iterator cannot be created.
absl::flat_hash_set<std::string> ICUTokenizeWords(std::string_view text) {
  absl::flat_hash_set<std::string> words;

  // If text contains only ASCII, skip working with ICU resources
  if (IsAllAscii(text)) {
    // Compiling a std::regex is expensive, so build the pattern once and reuse it
    static const std::regex rx{"\\b.*?\\b", std::regex_constants::icase};
    std::cregex_iterator begin{text.data(), text.data() + text.size(), rx}, end{};

    for (auto it = begin; it != end; ++it) {
      std::string word = it->str();
      // The lazy pattern also produces zero-length matches at word boundaries and the
      // whitespace gaps between words; skip them, like the ICU path below does.
      if (absl::StripAsciiWhitespace(word).empty())
        continue;
      absl::AsciiStrToLower(&word);
      words.insert(std::move(word));
    }
    return words;
  }

  icu::UnicodeString uStr(text.data(), static_cast<int32_t>(text.size()), "UTF-8");

  UErrorCode status = U_ZERO_ERROR;
  std::unique_ptr<icu::BreakIterator> wordIter{
      icu::BreakIterator::createWordInstance(icu::Locale::getDefault(), status)};

  // Without a usable iterator there is nothing to segment with
  if (U_FAILURE(status) || !wordIter)
    return {};

  wordIter->setText(uStr);

  int32_t start = wordIter->first();
  for (int32_t end = wordIter->next(); end != icu::BreakIterator::DONE;
       start = end, end = wordIter->next()) {
    icu::UnicodeString word = uStr.tempSubStringBetween(start, end);
    // If the substring is not a space, convert it to lowercase and add to results
    if (!word.isBogus() && !word.trim().isEmpty()) {
      word.toLower();
      // toUTF8String() appends to its argument, so every token gets a fresh string rather
      // than reusing one whose contents are unspecified after it has been moved from.
      std::string utf8Word;
      word.toUTF8String(utf8Word);
      words.insert(std::move(utf8Word));
    }
  }

  return words;
}
// Lower-case a UTF-8 encoded string with ICU and return the result re-encoded as UTF-8.
// The argument-less toLower() applies the conventions of ICU's default locale.
std::string ICUToLowercase(string_view input) {
  std::string lowered;
  icu::UnicodeString::fromUTF8(input).toLower().toUTF8String(lowered);
  return lowered;
}
// Split taglist, remove duplicates and convert all to lowercase
absl::flat_hash_set<string> NormalizeTags(string_view taglist) {
string tmp;
@ -73,7 +120,15 @@ vector<DocId> NumericIndex::Range(int64_t l, int64_t r) const {
}
// Look up `str` in entries_ and return a pointer to the stored set, or nullptr when no entry
// exists for it.
const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
// NOTE(review): this view is a diff hunk - the next line appears to be the pre-change lookup
// that the normalising lookup further down replaces; confirm against the actual file.
auto it = entries_.find(absl::StripAsciiWhitespace(str));
// Normalise the key: drop surrounding ASCII whitespace first
str = absl::StripAsciiWhitespace(str);
// ... then lower-case it, using the cheap ASCII routine when possible and ICU otherwise
string word;
if (IsAllAscii(str))
word = absl::AsciiStrToLower(str);
else
word = ICUToLowercase(str);
auto it = entries_.find(word);
return (it != entries_.end()) ? &it->second : nullptr;
}
@ -88,7 +143,7 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
}
// Tokenize the value of a text field into a set of words.
// NOTE(review): this view is a diff hunk - the TokenizeWords call appears to be the pre-change
// line and the ICUTokenizeWords call its replacement; confirm against the actual file.
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
return TokenizeWords(value);
return ICUTokenizeWords(value);
}
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {

View File

@ -287,6 +287,25 @@ TEST_F(SearchFamilyTest, SimpleUpdates) {
}
}
// Text search has to match non-ASCII words regardless of letter case
TEST_F(SearchFamilyTest, Unicode) {
  EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");

  // One document per script; the mixed and upper case spellings exercise the
  // utf-8 lowercase conversion
  Run({"hset", "d:1", "title", "Веселая СТРЕКОЗА Иван", "visits", "400"});
  Run({"hset", "d:2", "title", "Die fröhliche Libelle Günther", "visits", "300"});
  Run({"hset", "d:3", "title", "השפירית המהירה יעקב", "visits", "200"});
  Run({"hset", "d:4", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας", "visits", "100"});

  // A single query with the word in every language, in arbitrary case, hits all documents
  EXPECT_THAT(Run({"ft.search", "i1", "стРекоЗа|liBellE|השפירית|λΙβελλοΎλη"}),
              AreDocIds("d:1", "d:2", "d:3", "d:4"));

  // The returned document carries its original, unmodified field values
  auto greekReply = Run({"ft.search", "i1", "λιβελλούλη"});
  EXPECT_THAT(greekReply.GetVec()[2].GetVec(),
              UnorderedElementsAre("visits", "100", "title", "πανίσχυρη ΛΙΒΕΛΛΟΎΛΗ Δίας"));
}
TEST_F(SearchFamilyTest, SimpleExpiry) {
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "expires-in", "numeric"}), "OK");

View File

@ -12,7 +12,7 @@ RUN \
apt update && \
apt install -q -y autoconf-archive cmake curl git libssl-dev \
libunwind-dev ninja-build libtool gcc-9 g++-9 libboost-fiber-dev \
libxml2-dev zip libzstd-dev bison
libxml2-dev zip libzstd-dev bison libicu-dev
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 \
--slave /usr/bin/g++ g++ /usr/bin/g++-9