From b70bf938c230bb5e8aafe95094975a8e10109876 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 10 Oct 2014 21:17:40 +0200 Subject: util/UTF8: add SequenceLengthUTF8() --- src/util/UTF8.cxx | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'src/util/UTF8.cxx') diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx index 273cbac1e..50ff19e88 100644 --- a/src/util/UTF8.cxx +++ b/src/util/UTF8.cxx @@ -166,6 +166,86 @@ ValidateUTF8(const char *p) return true; } +size_t +SequenceLengthUTF8(char ch) +{ + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* 1 continuation */ + return 2; + else if (IsLeading2(ch)) + /* 2 continuations */ + return 3; + else if (IsLeading3(ch)) + /* 3 continuations */ + return 4; + else if (IsLeading4(ch)) + /* 4 continuations */ + return 5; + else if (IsLeading5(ch)) + /* 5 continuations */ + return 6; + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; + +} + +template +struct CheckSequenceUTF8 { + gcc_pure + bool operator()(const char *p) const { + return IsContinuation(*p) && CheckSequenceUTF8()(p + 1); + } +}; + +template<> +struct CheckSequenceUTF8<0u> { + constexpr bool operator()(gcc_unused const char *p) const { + return true; + } +}; + +template +gcc_pure +static size_t +InnerSequenceLengthUTF8(const char *p) +{ + return CheckSequenceUTF8()(p) + ? L + 1 + : 0u; +} + +size_t +SequenceLengthUTF8(const char *p) +{ + const unsigned char ch = *p++; + + if (IsASCII(ch)) + return 1; + else if (IsLeading1(ch)) + /* 1 continuation */ + return InnerSequenceLengthUTF8<1>(p); + else if (IsLeading2(ch)) + /* 2 continuations */ + return InnerSequenceLengthUTF8<2>(p); + else if (IsLeading3(ch)) + /* 3 continuations */ + return InnerSequenceLengthUTF8<3>(p); + else if (IsLeading4(ch)) + /* 4 continuations */ + return InnerSequenceLengthUTF8<4>(p); + else if (IsLeading5(ch)) + /* 5 continuations */ + return InnerSequenceLengthUTF8<5>(p); + else + /* continuation without a prefix or some other illegal + start byte */ + return 0; +} + static const char * FindNonASCIIOrZero(const char *p) { -- cgit v1.2.3