From 3273a79b9881483fa1e77a588e4369928dc3c4f7 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot
Date: Fri, 3 Nov 2023 09:25:59 -0400
Subject: [PATCH] fix characters

---
Hand-edited on review: the Windows fix-up now patches a private copy of
result.text and bounds-checks its look-ahead. The post-image hash on the
index line predates this edit.

 src/transcription-filter.cpp | 84 +++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 11 deletions(-)

diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 6ec573b..6735556 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -193,22 +193,84 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
 	}
 }
 
+// True when `c` has the bit pattern of a 2-, 3- or 4-byte UTF-8 lead byte.
+inline bool is_lead_byte(const uint8_t c)
+{
+	return (c & 0xe0) == 0xc0 || (c & 0xf0) == 0xe0 || (c & 0xf8) == 0xf0;
+}
+
+// True when `c` has the bit pattern of a UTF-8 trail byte (10xxxxxx).
+inline bool is_trail_byte(const uint8_t c)
+{
+	return (c & 0xc0) == 0x80;
+}
+
+// Sequence length announced by lead byte `c`; 1 for every other byte value.
+inline int lead_byte_length(const uint8_t c)
+{
+	if ((c & 0xe0) == 0xc0) {
+		return 2;
+	} else if ((c & 0xf0) == 0xe0) {
+		return 3;
+	} else if ((c & 0xf8) == 0xf0) {
+		return 4;
+	} else {
+		return 1;
+	}
+}
+
+// True when c[0] is followed by as many trail bytes as it announces.
+// `c` must point into a NUL-terminated buffer (or at 4 readable bytes): the
+// scan stops at the first non-trail byte, so it never runs past a terminator.
+inline bool is_valid_lead_byte(const uint8_t *c)
+{
+	const int length = lead_byte_length(c[0]);
+	for (int k = 1; k < length; ++k) {
+		if (!is_trail_byte(c[k])) {
+			return false;
+		}
+	}
+	return true;
+}
+
 void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
 {
 #ifdef _WIN32
 	// Some UTF8 charsets on Windows output have a bug, instead of 0xd? it outputs
-	// 0xf?, and 0xc? becomes 0xe?, so we need to replace it.
-	std::string str_copy = result.text;
-	for (size_t i = 0; i < str_copy.size(); ++i) {
-		// if the char MSBs starts with 0xf replace the MSBs with 0xd
-		if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
-			str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xd0;
-		}
-		// if the char MSBs starts with 0xe replace the char with 0xc
-		if ((str_copy.c_str()[i] & 0xf0) == 0xe0) {
-			str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xc0;
-		}
+	// 0xf?, and 0xc? becomes 0xe?, so we need to fix it.
+	// Patch a private copy: result.text is const, and the buffer returned by
+	// c_str() must not be written to.
+	std::string str_copy = result.text;
+	uint8_t *bytes = reinterpret_cast<uint8_t *>(&str_copy[0]);
+	const size_t size = str_copy.size();
+	for (size_t i = 0; i < size; ++i) {
+		if (is_lead_byte(bytes[i])) {
+			// this is a unicode leading byte
+			// if the next char is 0xff - it's a bug char, replace it with 0x9f
+			if (i + 1 < size && bytes[i + 1] == 0xff) {
+				bytes[i + 1] = 0x9f;
+			}
+			if (!is_valid_lead_byte(bytes + i)) {
+				// This is a bug lead byte: it is not followed by the trail
+				// bytes it announces, so shift it down (e.g. 0xe? -> 0xc?)
+				bytes[i] = static_cast<uint8_t>(bytes[i] - 0x20);
+			}
+		} else if (bytes[i] >= 0xf8) {
+			// this may be a malformed lead byte.
+			// let's see if it becomes a valid lead byte if we "fix" it
+			// Copy only bytes that exist; the rest of the window stays 0,
+			// which is not a trail byte, so nothing past the string is read.
+			uint8_t window[4] = {0, 0, 0, 0};
+			window[0] = static_cast<uint8_t>(bytes[i] - 0x20);
+			for (size_t k = 1; k < 4 && i + k < size; ++k) {
+				window[k] = bytes[i + k];
+			}
+			if (is_valid_lead_byte(window)) {
+				// this is a malformed lead byte, fix it
+				bytes[i] = window[0];
+			}
+		}
 	}
 #else
 	std::string str_copy = result.text;
 #endif