diff options
Diffstat (limited to 'src/util/UTF8.cxx')
-rw-r--r-- | src/util/UTF8.cxx | 345 |
1 files changed, 345 insertions, 0 deletions
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx new file mode 100644 index 000000000..50ff19e88 --- /dev/null +++ b/src/util/UTF8.cxx @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2011-2014 Max Kellermann <max@duempel.org> + * http://www.musicpd.org + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * FOUNDATION OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "UTF8.hxx" +#include "CharUtil.hxx" + +#include <algorithm> + +/** + * Is this a leading byte that is followed by 1 continuation byte? + */ +static constexpr bool +IsLeading1(unsigned char ch) +{ + return (ch & 0xe0) == 0xc0; +} + +static constexpr unsigned char +MakeLeading1(unsigned char value) +{ + return 0xc0 | value; +} + +/** + * Is this a leading byte that is followed by 2 continuation byte? + */ +static constexpr bool +IsLeading2(unsigned char ch) +{ + return (ch & 0xf0) == 0xe0; +} + +static constexpr unsigned char +MakeLeading2(unsigned char value) +{ + return 0xe0 | value; +} + +/** + * Is this a leading byte that is followed by 3 continuation byte? + */ +static constexpr bool +IsLeading3(unsigned char ch) +{ + return (ch & 0xf8) == 0xf0; +} + +static constexpr unsigned char +MakeLeading3(unsigned char value) +{ + return 0xf0 | value; +} + +/** + * Is this a leading byte that is followed by 4 continuation byte? + */ +static constexpr bool +IsLeading4(unsigned char ch) +{ + return (ch & 0xfc) == 0xf8; +} + +static constexpr unsigned char +MakeLeading4(unsigned char value) +{ + return 0xf8 | value; +} + +/** + * Is this a leading byte that is followed by 5 continuation byte? + */ +static constexpr bool +IsLeading5(unsigned char ch) +{ + return (ch & 0xfe) == 0xfc; +} + +static constexpr unsigned char +MakeLeading5(unsigned char value) +{ + return 0xfc | value; +} + +static constexpr bool +IsContinuation(unsigned char ch) +{ + return (ch & 0xc0) == 0x80; +} + +/** + * Generate a continuation byte of the low 6 bit. + */ +static constexpr unsigned char +MakeContinuation(unsigned char value) +{ + return 0x80 | (value & 0x3f); +} + +bool +ValidateUTF8(const char *p) +{ + for (; *p != 0; ++p) { + unsigned char ch = *p; + if (IsASCII(ch)) + continue; + + if (IsContinuation(ch)) + /* continuation without a prefix */ + return false; + + if (IsLeading1(ch)) { + /* 1 continuation */ + if (!IsContinuation(*++p)) + return false; + } else if (IsLeading2(ch)) { + /* 2 continuations */ + if (!IsContinuation(*++p) || !IsContinuation(*++p)) + return false; + } else if (IsLeading3(ch)) { + /* 3 continuations */ + if (!IsContinuation(*++p) || !IsContinuation(*++p) || + !IsContinuation(*++p)) + return false; + } else if (IsLeading4(ch)) { + /* 4 continuations */ + if (!IsContinuation(*++p) || !IsContinuation(*++p) || + !IsContinuation(*++p) || !IsContinuation(*++p)) + return false; + } else if (IsLeading5(ch)) { + /* 5 continuations */ + if (!IsContinuation(*++p) || !IsContinuation(*++p) || + !IsContinuation(*++p) || !IsContinuation(*++p) || + !IsContinuation(*++p)) + return false; + } else + return false; + } + + return true; +} + +size_t +SequenceLengthUTF8(char ch) +{ + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* 1 continuation */ + return 2; + else if (IsLeading2(ch)) + /* 2 continuations */ + return 3; + else if (IsLeading3(ch)) + /* 3 continuations */ + return 4; + else if (IsLeading4(ch)) + /* 4 continuations */ + return 5; + else if (IsLeading5(ch)) + /* 5 continuations */ + return 6; + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; + +} + +template<size_t L> +struct CheckSequenceUTF8 { + gcc_pure + bool operator()(const char *p) const { + return IsContinuation(*p) && CheckSequenceUTF8<L-1>()(p + 1); + } +}; + +template<> +struct CheckSequenceUTF8<0u> { + constexpr bool operator()(gcc_unused const char *p) const { + return true; + } +}; + +template<size_t L> +gcc_pure +static size_t +InnerSequenceLengthUTF8(const char *p) +{ + return CheckSequenceUTF8<L>()(p) + ? L + 1 + : 0u; +} + +size_t +SequenceLengthUTF8(const char *p) +{ + const unsigned char ch = *p++; + + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* 1 continuation */ + return InnerSequenceLengthUTF8<1>(p); + else if (IsLeading2(ch)) + /* 2 continuations */ + return InnerSequenceLengthUTF8<2>(p); + else if (IsLeading3(ch)) + /* 3 continuations */ + return InnerSequenceLengthUTF8<3>(p); + else if (IsLeading4(ch)) + /* 4 continuations */ + return InnerSequenceLengthUTF8<4>(p); + else if (IsLeading5(ch)) + /* 5 continuations */ + return InnerSequenceLengthUTF8<5>(p); + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; +} + +static const char * +FindNonASCIIOrZero(const char *p) +{ + while (*p != 0 && IsASCII(*p)) + ++p; + return p; +} + +const char * +Latin1ToUTF8(const char *gcc_restrict src, char *gcc_restrict buffer, + size_t buffer_size) +{ + const char *p = FindNonASCIIOrZero(src); + if (*p == 0) + /* everything is plain ASCII, we don't need to convert anything */ + return src; + + if ((size_t)(p - src) >= buffer_size) + /* buffer too small */ + return nullptr; + + const char *const end = buffer + buffer_size; + char *q = std::copy(src, p, buffer); + + while (*p != 0) { + unsigned char ch = *p++; + + if (IsASCII(ch)) { + *q++ = ch; + + if (q >= end) + /* buffer too small */ + return nullptr; + } else { + if (q + 2 >= end) + /* buffer too small */ + return nullptr; + + *q++ = MakeLeading1(ch >> 6); + *q++ = MakeContinuation(ch); + } + } + + *q = 0; + return buffer; +} + +char * +UnicodeToUTF8(unsigned ch, char *q) +{ + if (gcc_likely(ch < 0x80)) { + *q++ = (char)ch; + } else if (gcc_likely(ch < 0x800)) { + *q++ = MakeLeading1(ch >> 6); + *q++ = MakeContinuation(ch); + } else if (ch < 0x10000) { + *q++ = MakeLeading2(ch >> 12); + *q++ = MakeContinuation(ch >> 6); + *q++ = MakeContinuation(ch); + } else if (ch < 0x200000) { + *q++ = MakeLeading3(ch >> 18); + *q++ = MakeContinuation(ch >> 12); + *q++ = MakeContinuation(ch >> 6); + *q++ = MakeContinuation(ch); + } else if (ch < 0x4000000) { + *q++ = MakeLeading4(ch >> 24); + *q++ = MakeContinuation(ch >> 18); + *q++ = MakeContinuation(ch >> 12); + *q++ = MakeContinuation(ch >> 6); + *q++ = MakeContinuation(ch); + } else if (ch < 0x80000000) { + *q++ = MakeLeading5(ch >> 30); + *q++ = MakeContinuation(ch >> 24); + *q++ = MakeContinuation(ch >> 18); + *q++ = MakeContinuation(ch >> 12); + *q++ = MakeContinuation(ch >> 6); + *q++ = MakeContinuation(ch); + } else { + // error + } + + return q; +} + +size_t +LengthUTF8(const char *p) +{ + /* this is a very naive implementation: it does not do any + verification, it just counts the bytes that are not a UTF-8 + continuation */ + + size_t n = 0; + for (; *p != 0; ++p) + if (!IsContinuation(*p)) + ++n; + return n; +} |