From b70bf938c230bb5e8aafe95094975a8e10109876 Mon Sep 17 00:00:00 2001
From: Max Kellermann
Date: Fri, 10 Oct 2014 21:17:40 +0200
Subject: util/UTF8: add SequenceLengthUTF8()

---
 src/util/UTF8.cxx | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/util/UTF8.hxx | 16 +++++++++++
 2 files changed, 96 insertions(+)

diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx
index 273cbac1e..50ff19e88 100644
--- a/src/util/UTF8.cxx
+++ b/src/util/UTF8.cxx
@@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
 	return true;
 }
 
+size_t
+SequenceLengthUTF8(char ch)
+{
+	if (IsASCII(ch))
+		return 1;
+	else if (IsLeading1(ch))
+		/* 1 continuation */
+		return 2;
+	else if (IsLeading2(ch))
+		/* 2 continuations */
+		return 3;
+	else if (IsLeading3(ch))
+		/* 3 continuations */
+		return 4;
+	else if (IsLeading4(ch))
+		/* 4 continuations */
+		return 5;
+	else if (IsLeading5(ch))
+		/* 5 continuations */
+		return 6;
+	else
+		/* continuation without a prefix or some other illegal
+		   start byte */
+		return 0;
+
+}
+
+template<size_t L>
+struct CheckSequenceUTF8 {
+	gcc_pure
+	bool operator()(const char *p) const {
+		return IsContinuation(*p) && CheckSequenceUTF8<L-1>()(p + 1);
+	}
+};
+
+template<>
+struct CheckSequenceUTF8<0u> {
+	constexpr bool operator()(gcc_unused const char *p) const {
+		return true;
+	}
+};
+
+template<size_t L>
+gcc_pure
+static size_t
+InnerSequenceLengthUTF8(const char *p)
+{
+	return CheckSequenceUTF8<L>()(p)
+		? L + 1
+		: 0u;
+}
+
+size_t
+SequenceLengthUTF8(const char *p)
+{
+	const unsigned char ch = *p++;
+
+	if (IsASCII(ch))
+		return 1;
+	else if (IsLeading1(ch))
+		/* 1 continuation */
+		return InnerSequenceLengthUTF8<1>(p);
+	else if (IsLeading2(ch))
+		/* 2 continuations */
+		return InnerSequenceLengthUTF8<2>(p);
+	else if (IsLeading3(ch))
+		/* 3 continuations */
+		return InnerSequenceLengthUTF8<3>(p);
+	else if (IsLeading4(ch))
+		/* 4 continuations */
+		return InnerSequenceLengthUTF8<4>(p);
+	else if (IsLeading5(ch))
+		/* 5 continuations */
+		return InnerSequenceLengthUTF8<5>(p);
+	else
+		/* continuation without a prefix or some other illegal
+		   start byte */
+		return 0;
+}
+
 static const char *
 FindNonASCIIOrZero(const char *p)
 {
diff --git a/src/util/UTF8.hxx b/src/util/UTF8.hxx
index d3d694f6b..82d324f3e 100644
--- a/src/util/UTF8.hxx
+++ b/src/util/UTF8.hxx
@@ -42,6 +42,22 @@ gcc_pure gcc_nonnull_all
 bool
 ValidateUTF8(const char *p);
 
+/**
+ * @return the length in bytes of the sequence beginning with the given
+ * character, or 0 if the character is not a valid start byte
+ */
+gcc_const
+size_t
+SequenceLengthUTF8(char ch);
+
+/**
+ * @return the length in bytes of the first sequence in the given
+ * string, or 0 if the sequence is malformed
+ */
+gcc_pure
+size_t
+SequenceLengthUTF8(const char *p);
+
 /**
  * Convert the specified string from ISO-8859-1 to UTF-8.
  *
-- 
cgit v1.2.3