From af8fa664f71276fb857360531f04b1c7fb101d22 Mon Sep 17 00:00:00 2001
From: tobigun <tobigun@b956fd51-792f-4845-bead-9b4dfca2ff2c>
Date: Sat, 14 Mar 2009 22:10:00 +0000
Subject: - font-engine uses UCS4 internally - more UTf-8 and UCS4 routines in
 UnicodeUtils

git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/branches/experimental@1636 b956fd51-792f-4845-bead-9b4dfca2ff2c
---
 unicode/src/base/UUnicodeUtils.pas | 133 +++++++++++++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 5 deletions(-)

(limited to 'unicode/src/base/UUnicodeUtils.pas')

diff --git a/unicode/src/base/UUnicodeUtils.pas b/unicode/src/base/UUnicodeUtils.pas
index 01c279bd..26f240a9 100644
--- a/unicode/src/base/UUnicodeUtils.pas
+++ b/unicode/src/base/UUnicodeUtils.pas
@@ -34,11 +34,11 @@ interface
 {$I switches.inc}
 
 uses
-  SysUtils
 {$IFDEF MSWINDOWS}
-  , Windows
+  Windows,
 {$ENDIF}
-  ;
+  SysUtils;
+  
 (*
  * Character classes
  *)
@@ -58,6 +58,19 @@ function IsPunctuationChar(ch: UCS4Char): boolean; overload;
 function IsControlChar(ch: WideChar): boolean; overload;
 function IsControlChar(ch: UCS4Char): boolean; overload;
 
+{**
+ * Checks if the given string is a valid UTF-8 string.
+ * If an ANSI encoded string (with char codes >= 128) is passed, the
+ * function will most probably return false, as most ANSI byte sequences
+ * are illegal in UTF-8.
+ *}
+function IsUTF8String(const str: AnsiString): boolean;
+
+{**
+ * Checks if the string is composed of ASCII characters.
+ *}
+function IsASCIIString(const str: AnsiString): boolean;
+
 {*
  * String format conversion
  *}
@@ -71,6 +84,12 @@ function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;
  *}
 function LengthUTF8(const str: UTF8String): integer;
 
+{**
+ * Returns the length of a UCS4String. Note that Length(UCS4String) returns
+ * the length+1 as UCS4Strings are zero-terminated.
+ *}
+function LengthUCS4(const str: UCS4String): integer;
+
 function UTF8CompareStr(const S1, S2: UTF8String): integer;
 function UTF8CompareText(const S1, S2: UTF8String): integer;
 
@@ -93,12 +112,19 @@ function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;
 function UCS4UpperCase(const str: UCS4String): UCS4String; overload;
 
 {**
- *
+ * Converts a UCS4Char to a UCS4String.
+ * Note that UCS4Strings are zero-terminated dynamic arrays.
  *}
 function UCS4CharToString(ch: UCS4Char): UCS4String;
 
-(*
+{**
+ * Copies Count characters of str, starting at the zero-based position Index.
+ * A negative Count copies up to the end of str. Note: Do not use Copy() for
+ * UCS4Strings as its result lacks the trailing #0 and hence is invalid.
+ *}
+function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;
 
+(*
  * Converts a WideString to its upper-case representation.
  * Wrapper for WideUpperCase. Needed because some plattforms have problems with
  * unicode support.
@@ -199,6 +225,78 @@ begin
   Result := IsControlChar(WideChar(Ord(ch)));
 end;
 
+
+function IsUTF8String(const str: AnsiString): boolean;
+
+  // find the most significant zero bit (Result: [7..-1], -1 if b = $FF)
+  function FindZeroMSB(b: byte): integer;
+  var
+    Mask: byte;
+  begin
+    Mask := $80;
+    Result := 7;
+    while (b and Mask <> 0) do
+    begin
+      Mask := Mask shr 1;
+      Dec(Result);
+    end;
+  end;
+
+var
+  I: integer;
+  ZeroBit: integer;
+  SeqCount: integer; // number of trailing bytes to follow
+begin
+  // Note: overlong forms and out-of-range code points are not rejected
+  Result := false;
+  SeqCount := 0;
+  for I := 1 to Length(str) do
+  begin
+    if (str[I] >= #128) then
+    begin
+      ZeroBit := FindZeroMSB(Ord(str[I]));
+      // trailing byte expected
+      if (SeqCount > 0) then
+      begin
+        // check if trailing byte has pattern 10xxxxxx
+        if (ZeroBit <> 6) then
+          Exit;
+        Dec(SeqCount);
+      end
+      else // leading byte expected
+      begin
+        // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
+        if (ZeroBit > 5) or (ZeroBit < 3) then
+          Exit;
+        // calculate number of trailing bytes (1, 2 or 3)
+        SeqCount := 6 - ZeroBit;
+      end;
+    end
+    else if (SeqCount > 0) then
+      Exit; // ASCII char interrupts an unfinished multi-byte sequence
+  end;
+  // trailing bytes missing?
+  if (SeqCount > 0) then
+    Exit;
+  Result := true;
+end;
+
+function IsASCIIString(const str: AnsiString): boolean;
+var
+  CharPos: integer;
+begin
+  // report failure until every char has been checked
+  Result := false;
+  // a char code above 127 lies outside of the 7-bit ASCII range
+  for CharPos := 1 to Length(str) do
+  begin
+    if (Ord(str[CharPos]) > 127) then
+      Exit;
+  end;
+  Result := true;
+end;
+
+
 function UTF8ToUCS4String(const str: UTF8String): UCS4String;
 begin
   Result := WideStringToUCS4String(UTF8Decode(str));
@@ -219,6 +317,11 @@ begin
   Result := Length(UTF8ToUCS4String(str));
 end;
 
+function LengthUCS4(const str: UCS4String): integer;
+begin // High() is -1 for an unassigned (nil) array: report that as length 0
+  if (Length(str) > 0) then Result := High(str) else Result := 0;
+end;
+
 function UTF8CompareStr(const S1, S2: UTF8String): integer;
 begin
   // FIXME
@@ -284,6 +387,26 @@ begin
   Result[1] := 0;
 end;
 
+function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
+var
+  I, MaxCount: integer;
+begin
+  // a negative Index would read in front of the array: start at 0 instead
+  if (Index < 0) then
+    Index := 0;
+  // chars available from Index on (none if Index lies behind the end)
+  MaxCount := LengthUCS4(str)-Index;
+  if (MaxCount < 0) then
+    MaxCount := 0;
+  // a negative or too large Count selects everything up to the end
+  if (Count > MaxCount) or (Count < 0) then
+    Count := MaxCount;
+  SetLength(Result, Count + 1); // one extra element for the zero terminator
+  for I := 0 to Count-1 do
+    Result[I] := str[Index+I];
+  Result[Count] := 0;
+end;
+
 function WideStringUpperCase(ch: WideChar): WideString;
 begin
   // If WideChar #0 is converted to a WideString in Delphi, a string with
-- 
cgit v1.2.3