From 41507d81291e215b2aec6dfcb9bc7ed7063853d5 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 12 May 2014 14:35:25 +0200 Subject: icu/Collate: use u_strFoldCase() instead of ucol_getSortKey() Turns out ucol_getSortKey() does not what I thought it does. --- src/lib/icu/Collate.cxx | 51 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'src/lib') diff --git a/src/lib/icu/Collate.cxx b/src/lib/icu/Collate.cxx index fac73783b..4d1526b28 100644 --- a/src/lib/icu/Collate.cxx +++ b/src/lib/icu/Collate.cxx @@ -23,6 +23,7 @@ #ifdef HAVE_ICU #include "Error.hxx" #include "util/WritableBuffer.hxx" +#include "util/ConstBuffer.hxx" #include "util/Error.hxx" #include "util/Domain.hxx" @@ -97,6 +98,28 @@ UCharFromUTF8(const char *src) return { dest, size_t(dest_length) }; } +static WritableBuffer +UCharToUTF8(ConstBuffer src) +{ + assert(!src.IsNull()); + + /* worst-case estimate */ + size_t dest_capacity = 4 * src.size; + + char *dest = new char[dest_capacity]; + + UErrorCode error_code = U_ZERO_ERROR; + int32_t dest_length; + u_strToUTF8(dest, dest_capacity, &dest_length, src.data, src.size, + &error_code); + if (U_FAILURE(error_code)) { + delete[] dest; + return nullptr; + } + + return { dest, size_t(dest_length) }; +} + #endif gcc_pure @@ -147,19 +170,27 @@ IcuCaseFold(const char *src) if (u.IsNull()) return std::string(src); - size_t dest_length = ucol_getSortKey(collator, u.data, u.size, - nullptr, 0); - if (dest_length == 0) { - delete[] u.data; + size_t folded_capacity = u.size * 2u; + UChar *folded = new UChar[folded_capacity]; + + UErrorCode error_code = U_ZERO_ERROR; + size_t folded_length = u_strFoldCase(folded, folded_capacity, + u.data, u.size, + U_FOLD_CASE_DEFAULT, + &error_code); + delete[] u.data; + if (folded_length == 0 || error_code != U_ZERO_ERROR) { + delete[] folded; return std::string(src); } - uint8_t *dest = new uint8_t[dest_length]; - ucol_getSortKey(collator, u.data, u.size, - dest, dest_length); - delete[] u.data; - std::string result((const char *)dest); - delete[] dest; + auto result2 = UCharToUTF8({folded, folded_length}); + delete[] folded; + if (result2.IsNull()) + return std::string(src); + + std::string result(result2.data, result2.size); + delete[] result2.data; #elif defined(HAVE_GLIB) char *tmp = g_utf8_casefold(src, -1); std::string result(tmp); -- cgit v1.2.3