From b70bf938c230bb5e8aafe95094975a8e10109876 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max@duempel.org>
Date: Fri, 10 Oct 2014 21:17:40 +0200
Subject: util/UTF8: add SequenceLengthUTF8()

---
 src/util/UTF8.cxx | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/util/UTF8.hxx | 16 +++++++++++
 2 files changed, 96 insertions(+)

(limited to 'src')
diff --git a/src/util/UTF8.cxx b/src/util/UTF8.cxx
index 273cbac1e..50ff19e88 100644
--- a/src/util/UTF8.cxx
+++ b/src/util/UTF8.cxx
@@ -166,6 +166,86 @@ ValidateUTF8(const char *p)
 	return true;
 }
 
+size_t
+SequenceLengthUTF8(char ch)
+{
+	if (IsASCII(ch))
+		return 1;
+	else if (IsLeading1(ch))
+		/* start byte + 1 continuation */
+		return 2;
+	else if (IsLeading2(ch))
+		/* start byte + 2 continuations */
+		return 3;
+	else if (IsLeading3(ch))
+		/* start byte + 3 continuations */
+		return 4;
+	else if (IsLeading4(ch))
+		/* start byte + 4 continuations (5-byte form, not in RFC 3629) */
+		return 5;
+	else if (IsLeading5(ch))
+		/* start byte + 5 continuations (6-byte form, not in RFC 3629) */
+		return 6;
+	else
+		/* continuation without a prefix or some other illegal
+		   start byte */
+		return 0;
+
+}
+
+template<size_t L>
+struct CheckSequenceUTF8 { /* functor: do L continuation bytes start at p? */
+	gcc_pure
+	bool operator()(const char *p) const {
+		return IsContinuation(*p) && CheckSequenceUTF8<L-1>()(p + 1);
+	} /* && short-circuits: nothing after the first failing byte is read */
+};
+
+template<>
+struct CheckSequenceUTF8<0u> { /* recursion terminator: no bytes left to check */
+	constexpr bool operator()(gcc_unused const char *p) const {
+		return true; /* p is never dereferenced here */
+	}
+};
+
+template<size_t L>
+gcc_pure
+static size_t
+InnerSequenceLengthUTF8(const char *p) /* p: the byte right after the start byte */
+{
+	return CheckSequenceUTF8<L>()(p)
+		? L + 1 /* L continuations plus the start byte */
+		: 0u; /* at least one of the L bytes is not a continuation */
+}
+
+size_t
+SequenceLengthUTF8(const char *p)
+{
+	const unsigned char ch = *p++; /* p now points past the start byte */
+
+	if (IsASCII(ch))
+		return 1;
+	else if (IsLeading1(ch))
+		/* must be followed by 1 continuation */
+		return InnerSequenceLengthUTF8<1>(p);
+	else if (IsLeading2(ch))
+		/* must be followed by 2 continuations */
+		return InnerSequenceLengthUTF8<2>(p);
+	else if (IsLeading3(ch))
+		/* must be followed by 3 continuations */
+		return InnerSequenceLengthUTF8<3>(p);
+	else if (IsLeading4(ch))
+		/* must be followed by 4 continuations */
+		return InnerSequenceLengthUTF8<4>(p);
+	else if (IsLeading5(ch))
+		/* must be followed by 5 continuations */
+		return InnerSequenceLengthUTF8<5>(p);
+	else
+		/* continuation without a prefix or some other illegal
+		   start byte */
+		return 0;
+}
+
 static const char *
 FindNonASCIIOrZero(const char *p)
 {
diff --git a/src/util/UTF8.hxx b/src/util/UTF8.hxx
index d3d694f6b..82d324f3e 100644
--- a/src/util/UTF8.hxx
+++ b/src/util/UTF8.hxx
@@ -42,6 +42,22 @@ gcc_pure gcc_nonnull_all
 bool
 ValidateUTF8(const char *p);
 
+/**
+ * @return the length in bytes of the sequence beginning with the given
+ * character, or 0 if the character is not a valid start byte
+ */
+gcc_const
+size_t
+SequenceLengthUTF8(char ch);
+
+/**
+ * @return the length in bytes of the first sequence in the given string,
+ * or 0 if the sequence is malformed
+ */
+gcc_pure
+size_t
+SequenceLengthUTF8(const char *p);
+
 /**
  * Convert the specified string from ISO-8859-1 to UTF-8.
  *
-- 
cgit v1.2.3