aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/util/UTF8.cxx80
-rw-r--r--src/util/UTF8.hxx16
2 files changed, 96 insertions, 0 deletions
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx
index 273cbac1e..50ff19e88 100644
--- a/src/util/UTF8.cxx
+++ b/src/util/UTF8.cxx
@@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
return true;
}
+/**
+ * Determine the total length of a UTF-8 sequence by looking at its
+ * first byte only; the bytes following it are not examined.
+ *
+ * @return 1 for an ASCII character, otherwise the number of
+ * continuation bytes announced by the lead byte plus one; 0 if the
+ * byte cannot start a sequence
+ */
+size_t
+SequenceLengthUTF8(char ch)
+{
+	/* a plain ASCII character stands alone */
+	if (IsASCII(ch))
+		return 1;
+
+	/* lead byte announcing 1 continuation */
+	if (IsLeading1(ch))
+		return 2;
+
+	/* lead byte announcing 2 continuations */
+	if (IsLeading2(ch))
+		return 3;
+
+	/* lead byte announcing 3 continuations */
+	if (IsLeading3(ch))
+		return 4;
+
+	/* lead byte announcing 4 continuations */
+	if (IsLeading4(ch))
+		return 5;
+
+	/* lead byte announcing 5 continuations */
+	if (IsLeading5(ch))
+		return 6;
+
+	/* continuation without a prefix or some other illegal
+	   start byte */
+	return 0;
+}
+
+/**
+ * Functor which checks whether the N bytes starting at the given
+ * pointer are all UTF-8 continuation bytes.  The check stops at the
+ * first byte which is not a continuation, so nothing beyond that byte
+ * is read.
+ */
+template<size_t N>
+struct CheckSequenceUTF8 {
+	gcc_pure
+	bool operator()(const char *p) const {
+		if (!IsContinuation(*p))
+			return false;
+
+		/* this byte is fine; check the remaining N-1 bytes */
+		return CheckSequenceUTF8<N - 1>()(p + 1);
+	}
+};
+
+/**
+ * End of the recursion: with zero bytes left to check, the sequence
+ * is accepted without dereferencing the pointer.
+ */
+template<>
+struct CheckSequenceUTF8<0u> {
+	constexpr bool operator()(gcc_unused const char *p) const {
+		return true;
+	}
+};
+
+/**
+ * Verify the L bytes starting at p, which are expected to be the
+ * continuation bytes following a lead byte.
+ *
+ * @return L + 1 (i.e. the sequence length including the lead byte) if
+ * all L bytes are continuation bytes, 0 otherwise
+ */
+template<size_t L>
+gcc_pure
+static size_t
+InnerSequenceLengthUTF8(const char *p)
+{
+	if (!CheckSequenceUTF8<L>()(p))
+		return 0u;
+
+	return L + 1;
+}
+
+/**
+ * Determine the length of the first UTF-8 sequence in the given
+ * string.  Unlike the single-character overload, this one also
+ * verifies that the lead byte is followed by the announced number of
+ * continuation bytes.
+ *
+ * @return the length of the sequence in bytes (lead byte included),
+ * or 0 if the first byte is not a valid start byte or one of the
+ * expected continuation bytes is missing
+ */
+size_t
+SequenceLengthUTF8(const char *p)
+{
+	const unsigned char lead = *p;
+
+	/* from here on, p refers to the bytes after the lead byte */
+	++p;
+
+	/* a plain ASCII character stands alone */
+	if (IsASCII(lead))
+		return 1;
+
+	/* lead byte announcing 1 continuation */
+	if (IsLeading1(lead))
+		return InnerSequenceLengthUTF8<1>(p);
+
+	/* lead byte announcing 2 continuations */
+	if (IsLeading2(lead))
+		return InnerSequenceLengthUTF8<2>(p);
+
+	/* lead byte announcing 3 continuations */
+	if (IsLeading3(lead))
+		return InnerSequenceLengthUTF8<3>(p);
+
+	/* lead byte announcing 4 continuations */
+	if (IsLeading4(lead))
+		return InnerSequenceLengthUTF8<4>(p);
+
+	/* lead byte announcing 5 continuations */
+	if (IsLeading5(lead))
+		return InnerSequenceLengthUTF8<5>(p);
+
+	/* continuation without a prefix or some other illegal
+	   start byte */
+	return 0;
+}
+
static const char *
FindNonASCIIOrZero(const char *p)
{
diff --git a/src/util/UTF8.hxx b/src/util/UTF8.hxx
index d3d694f6b..82d324f3e 100644
--- a/src/util/UTF8.hxx
+++ b/src/util/UTF8.hxx
@@ -43,6 +43,22 @@ bool
ValidateUTF8(const char *p);
/**
+ * Determine the length of a UTF-8 sequence from its first byte only;
+ * the bytes following it are not examined.
+ *
+ * @return the number of bytes in the sequence beginning with the given
+ * character (including that character), or 0 if the character is not
+ * a valid start byte
+ */
+gcc_const
+size_t
+SequenceLengthUTF8(char ch);
+
+/**
+ * Determine the length of the first UTF-8 sequence in the given
+ * string, verifying that the start byte is followed by the expected
+ * number of continuation bytes.
+ *
+ * @return the number of bytes in the first sequence of the given
+ * string, or 0 if the sequence is malformed
+ */
+gcc_pure
+size_t
+SequenceLengthUTF8(const char *p);
+
+/**
* Convert the specified string from ISO-8859-1 to UTF-8.
*
* @return the UTF-8 version of the source string; may return #src if