diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/util/UTF8.cxx | 80 | ||||
-rw-r--r-- | src/util/UTF8.hxx | 16 |
2 files changed, 96 insertions, 0 deletions
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx index 273cbac1e..50ff19e88 100644 --- a/src/util/UTF8.cxx +++ b/src/util/UTF8.cxx @@ -166,6 +166,86 @@ ValidateUTF8(const char *p) return true; } +size_t +SequenceLengthUTF8(char ch) +{ + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* lead byte plus 1 continuation byte */ + return 2; + else if (IsLeading2(ch)) + /* lead byte plus 2 continuation bytes */ + return 3; + else if (IsLeading3(ch)) + /* lead byte plus 3 continuation bytes */ + return 4; + else if (IsLeading4(ch)) + /* lead byte plus 4 continuation bytes */ + return 5; + else if (IsLeading5(ch)) + /* lead byte plus 5 continuation bytes */ + return 6; + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; + +} + +template<size_t L> +struct CheckSequenceUTF8 { + gcc_pure + bool operator()(const char *p) const { + return IsContinuation(*p) && CheckSequenceUTF8<L-1>()(p + 1); + } +}; + +template<> +struct CheckSequenceUTF8<0u> { + constexpr bool operator()(gcc_unused const char *p) const { + return true; + } +}; + +template<size_t L> +gcc_pure +static size_t +InnerSequenceLengthUTF8(const char *p) +{ + return CheckSequenceUTF8<L>()(p) + ? 
L + 1 + : 0u; +} + +size_t +SequenceLengthUTF8(const char *p) +{ + const unsigned char ch = *p++; + + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* 1 continuation byte must follow */ + return InnerSequenceLengthUTF8<1>(p); + else if (IsLeading2(ch)) + /* 2 continuation bytes must follow */ + return InnerSequenceLengthUTF8<2>(p); + else if (IsLeading3(ch)) + /* 3 continuation bytes must follow */ + return InnerSequenceLengthUTF8<3>(p); + else if (IsLeading4(ch)) + /* 4 continuation bytes must follow */ + return InnerSequenceLengthUTF8<4>(p); + else if (IsLeading5(ch)) + /* 5 continuation bytes must follow */ + return InnerSequenceLengthUTF8<5>(p); + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; +} + static const char * FindNonASCIIOrZero(const char *p) { diff --git a/src/util/UTF8.hxx b/src/util/UTF8.hxx index d3d694f6b..82d324f3e 100644 --- a/src/util/UTF8.hxx +++ b/src/util/UTF8.hxx @@ -43,6 +43,22 @@ bool ValidateUTF8(const char *p); /** + * @return the length in bytes of the sequence beginning with the given + * character, or 0 if the character is not a valid start byte + */ +gcc_const +size_t +SequenceLengthUTF8(char ch); + +/** + * @return the length in bytes of the first sequence in the given string, + * or 0 if the sequence is malformed + */ +gcc_pure +size_t +SequenceLengthUTF8(const char *p); + +/** * Convert the specified string from ISO-8859-1 to UTF-8. * * @return the UTF-8 version of the source string; may return #src if |