aboutsummaryrefslogtreecommitdiffstats
path: root/src/base/UUnicodeUtils.pas
diff options
context:
space:
mode:
authors_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c>2009-11-09 00:27:55 +0000
committers_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c>2009-11-09 00:27:55 +0000
commit917901e8e33438c425aef50a0a7417f32d77b760 (patch)
tree95f081dd0d9a206bba3bd9c0a70e7a9a4cddafc8 /src/base/UUnicodeUtils.pas
parent474452a88427e6ea83d6435b117e5deb1d4cd0c6 (diff)
downloadusdx-917901e8e33438c425aef50a0a7417f32d77b760.tar.gz
usdx-917901e8e33438c425aef50a0a7417f32d77b760.tar.xz
usdx-917901e8e33438c425aef50a0a7417f32d77b760.zip
merged unicode branch (r1931) into trunk
git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/trunk@1939 b956fd51-792f-4845-bead-9b4dfca2ff2c
Diffstat (limited to '')
-rw-r--r--src/base/UUnicodeUtils.pas670
1 files changed, 670 insertions, 0 deletions
diff --git a/src/base/UUnicodeUtils.pas b/src/base/UUnicodeUtils.pas
new file mode 100644
index 00000000..37b53a67
--- /dev/null
+++ b/src/base/UUnicodeUtils.pas
@@ -0,0 +1,670 @@
+{* UltraStar Deluxe - Karaoke Game
+ *
+ * UltraStar Deluxe is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * $URL$
+ * $Id$
+ *}
+
+unit UUnicodeUtils;
+
+interface
+
+{$IFDEF FPC}
+ {$MODE Delphi}
+{$ENDIF}
+
+uses
+{$IFDEF MSWINDOWS}
+ Windows,
+{$ENDIF}
+ StrUtils,
+ SysUtils;
+
+type
+ // String with unknown encoding. Introduced with Delphi 2009 and maybe soon
+ // with FPC.
+ RawByteString = AnsiString;
+
+{**
+ * Returns true if the system uses UTF-8 as default string type
+ * (filesystem or API calls).
+ * This is always true on Mac OS X and always false on Win32. On Unix it depends
+ * on the LC_CTYPE setting.
+ * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true.
+ *}
+function IsNativeUTF8(): boolean;
+
+(*
+ * Character classes
+ *)
+
+function IsAlphaChar(ch: WideChar): boolean; overload;
+function IsAlphaChar(ch: UCS4Char): boolean; overload;
+
+function IsNumericChar(ch: WideChar): boolean; overload;
+function IsNumericChar(ch: UCS4Char): boolean; overload;
+
+function IsAlphaNumericChar(ch: WideChar): boolean; overload;
+function IsAlphaNumericChar(ch: UCS4Char): boolean; overload;
+
+function IsPunctuationChar(ch: WideChar): boolean; overload;
+function IsPunctuationChar(ch: UCS4Char): boolean; overload;
+
+function IsControlChar(ch: WideChar): boolean; overload;
+function IsControlChar(ch: UCS4Char): boolean; overload;
+
+function IsPrintableChar(ch: WideChar): boolean; overload;
+function IsPrintableChar(ch: UCS4Char): boolean; overload;
+
+{**
+ * Checks if the given string is a valid UTF-8 string.
+ * If an ANSI encoded string (with char codes >= 128) is passed, the
+ * function will most probably return false, as most ANSI strings sequences
+ * are illegal in UTF-8.
+ *}
+function IsUTF8String(const str: RawByteString): boolean;
+
+{**
+ * Iterates over an UTF-8 encoded string.
+ * StrPtr will be increased to the beginning of the next character on each
+ * call.
+ * Results true if the given string starts with an UTF-8 encoded char.
+ *}
+function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;
+
+{**
+ * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index.
+ * Index values start with 1.
+ *}
+procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
+procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
+
+{**
+ * Checks if the string is composed of ASCII characters.
+ *}
+function IsASCIIString(const str: RawByteString): boolean;
+
+{*
+ * String format conversion
+ *}
+
+function UTF8ToUCS4String(const str: UTF8String): UCS4String;
+function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload;
+function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;
+
+{**
+ * Returns the number of characters (not bytes) in string str.
+ *}
+function LengthUTF8(const str: UTF8String): integer;
+
+{**
+ * Returns the length of an UCS4String. Note that Length(UCS4String) returns
+ * the length+1 as UCS4Strings are zero-terminated.
+ *}
+function LengthUCS4(const str: UCS4String): integer;
+
+{** @seealso WideCompareStr *}
+function UTF8CompareStr(const S1, S2: UTF8String): integer;
+{** @seealso WideCompareText *}
+function UTF8CompareText(const S1, S2: UTF8String): integer;
+
+function UTF8StartsText(const SubText, Text: UTF8String): boolean;
+
+function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
+function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
+
+{** @seealso WideUpperCase *}
+function UTF8UpperCase(const str: UTF8String): UTF8String;
+{** @seealso WideCompareText *}
+function UTF8LowerCase(const str: UTF8String): UTF8String;
+
+{**
+ * Converts a UCS-4 char ch to its upper-case representation.
+ *}
+function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;
+
+{**
+ * Converts a UCS-4 string str to its upper-case representation.
+ *}
+function UCS4UpperCase(const str: UCS4String): UCS4String; overload;
+
+{**
+ * Converts a UCS4Char to an UCS4String.
+ * Note that UCS4Strings are zero-terminated dynamic arrays.
+ *}
+function UCS4CharToString(ch: UCS4Char): UCS4String;
+
+{**
+ * @seealso System.Pos()
+ *}
+function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
+
+{**
+ * Copies a segment of str starting with Index (1-based) with Count characters (not bytes).
+ *}
+function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String;
+
+{**
+ * Copies a segment of str starting with Index (0-based) with Count characters.
+ * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
+ * a trailing #0 character and hence is invalid.
+ *}
+function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;
+
+(*
+ * Converts a WideString to its upper- or lower-case representation.
+ * Wrapper for WideUpper/LowerCase. Needed because some plattforms have
+ * problems with unicode support.
+ *
+ * Note that characters in UTF-16 might consist of one or two WideChar valus
+ * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single
+ * character access, convert to UCS-4 where each character is represented by
+ * one UCS4Char.
+ *)
+function WideStringUpperCase(const str: WideString) : WideString; overload;
+function WideStringUpperCase(ch: WideChar): WideString; overload;
+function WideStringLowerCase(const str: WideString): WideString; overload;
+function WideStringLowerCase(ch: WideChar): WideString; overload;
+
+function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
+
+implementation
+
+{$IFDEF UNIX}
+{$IFNDEF DARWIN}
+const
+ LC_CTYPE = 0;
+
+function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c';
+{$ENDIF}
+{$ENDIF}
+
+var
+ NativeUTF8: boolean;
+
+procedure InitUnicodeUtils();
+{$IFDEF UNIX}
+{$IFNDEF DARWIN}
+var
+ localeName: PChar;
+{$ENDIF}
+{$ENDIF}
+begin
+ {$IF Defined(DARWIN)}
+ NativeUTF8 := true;
+ {$ELSEIF Defined(MSWindows)}
+ NativeUTF8 := false;
+ {$ELSEIF Defined(UNIX)}
+ // check if locale name contains UTF8 or UTF-8
+ localeName := setlocale(LC_CTYPE, nil);
+ NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0;
+ {$ELSE}
+ raise Exception.Create('Unknown system');
+ {$IFEND}
+end;
+
+function IsNativeUTF8(): boolean;
+begin
+ Result := NativeUTF8;
+end;
+
+function IsAlphaChar(ch: WideChar): boolean;
+begin
+ {$IFDEF MSWINDOWS}
+ Result := IsCharAlphaW(ch);
+ {$ELSE}
+ // TODO: add chars > 255 (or replace with libxml2 functions?)
+ case ch of
+ 'A'..'Z', // A-Z
+ 'a'..'z', // a-z
+ #170,#181,#186,
+ #192..#214,
+ #216..#246,
+ #248..#255:
+ Result := true;
+ else
+ Result := false;
+ end;
+ {$ENDIF}
+end;
+
+function IsAlphaChar(ch: UCS4Char): boolean;
+begin
+ Result := IsAlphaChar(WideChar(Ord(ch)));
+end;
+
+function IsNumericChar(ch: WideChar): boolean;
+begin
+ // TODO: replace with libxml2 functions?
+ // ignore non-arabic numerals as we do not want to handle them
+ case ch of
+ '0'..'9':
+ Result := true;
+ else
+ Result := false;
+ end;
+end;
+
+function IsNumericChar(ch: UCS4Char): boolean;
+begin
+ Result := IsNumericChar(WideChar(Ord(ch)));
+end;
+
+function IsAlphaNumericChar(ch: WideChar): boolean;
+begin
+ Result := (IsAlphaChar(ch) or IsNumericChar(ch));
+end;
+
+function IsAlphaNumericChar(ch: UCS4Char): boolean;
+begin
+ Result := (IsAlphaChar(ch) or IsNumericChar(ch));
+end;
+
+function IsPunctuationChar(ch: WideChar): boolean;
+begin
+ // TODO: add chars > 255 (or replace with libxml2 functions?)
+ case ch of
+ ' '..'/',':'..'@','['..'`','{'..'~',
+ #160..#191,#215,#247:
+ Result := true;
+ else
+ Result := false;
+ end;
+end;
+
+function IsPunctuationChar(ch: UCS4Char): boolean;
+begin
+ Result := IsPunctuationChar(WideChar(Ord(ch)));
+end;
+
+function IsControlChar(ch: WideChar): boolean;
+begin
+ case ch of
+ #0..#31,
+ #127..#159:
+ Result := true;
+ else
+ Result := false;
+ end;
+end;
+
+function IsControlChar(ch: UCS4Char): boolean;
+begin
+ Result := IsControlChar(WideChar(Ord(ch)));
+end;
+
+function IsPrintableChar(ch: WideChar): boolean;
+begin
+ Result := not IsControlChar(ch);
+end;
+
+function IsPrintableChar(ch: UCS4Char): boolean;
+begin
+ Result := IsPrintableChar(WideChar(Ord(ch)));
+end;
+
+
+function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;
+
+ // find the most significant zero bit (Result: [7..-1])
+ function FindZeroMSB(b: byte): integer;
+ var
+ Mask: byte;
+ begin
+ Mask := $80;
+ Result := 7;
+ while (b and Mask <> 0) do
+ begin
+ Mask := Mask shr 1;
+ Dec(Result);
+ end;
+ end;
+
+var
+ ZeroBit: integer;
+ SeqCount: integer; // number of trailing bytes to follow
+const
+ Mask: array[1..3] of byte = ($1F, $0F, $07);
+begin
+ Result := false;
+ SeqCount := 0;
+ Ch := 0;
+
+ while (StrPtr^ <> #0) do
+ begin
+ if (StrPtr^ < #128) then
+ begin
+ // check that no more trailing bytes are expected
+ if (SeqCount = 0) then
+ begin
+ Ch := Ord(StrPtr^);
+ Inc(StrPtr);
+ Result := true;
+ end;
+ Break;
+ end
+ else
+ begin
+ ZeroBit := FindZeroMSB(Ord(StrPtr^));
+ // trailing byte expected
+ if (SeqCount > 0) then
+ begin
+ // check if trailing byte has pattern 10xxxxxx
+ if (ZeroBit <> 6) then
+ begin
+ Inc(StrPtr);
+ Break;
+ end;
+
+ Dec(SeqCount);
+ Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F);
+
+ // check if char is finished
+ if (SeqCount = 0) then
+ begin
+ Inc(StrPtr);
+ Result := true;
+ Break;
+ end;
+ end
+ else // leading byte expected
+ begin
+ // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
+ if (ZeroBit > 5) or (ZeroBit < 3) then
+ begin
+ Inc(StrPtr);
+ Break;
+ end;
+ // calculate number of trailing bytes (1, 2 or 3)
+ SeqCount := 6 - ZeroBit;
+ // extract first part of char
+ Ch := Ord(StrPtr^) and Mask[SeqCount];
+ end;
+ end;
+
+ Inc(StrPtr);
+ end;
+
+ if (not Result) then
+ Ch := Ord('?');
+end;
+
+function IsUTF8String(const str: RawByteString): boolean;
+var
+ Ch: UCS4Char;
+ StrPtr: PAnsiChar;
+begin
+ Result := true;
+ StrPtr := PChar(str);
+ while (StrPtr^ <> #0) do
+ begin
+ if (not NextCharUTF8(StrPtr, Ch)) then
+ begin
+ Result := false;
+ Exit;
+ end;
+ end;
+end;
+
+function IsASCIIString(const str: RawByteString): boolean;
+var
+ I: integer;
+begin
+ for I := 1 to Length(str) do
+ begin
+ if (str[I] >= #128) then
+ begin
+ Result := false;
+ Exit;
+ end;
+ end;
+ Result := true;
+end;
+
+
+function UTF8ToUCS4String(const str: UTF8String): UCS4String;
+begin
+ Result := WideStringToUCS4String(UTF8Decode(str));
+end;
+
+function UCS4ToUTF8String(const str: UCS4String): UTF8String;
+begin
+ Result := UTF8Encode(UCS4StringToWideString(str));
+end;
+
+function UCS4ToUTF8String(ch: UCS4Char): UTF8String;
+begin
+ Result := UCS4ToUTF8String(UCS4CharToString(ch));
+end;
+
+function LengthUTF8(const str: UTF8String): integer;
+begin
+ Result := LengthUCS4(UTF8ToUCS4String(str));
+end;
+
+function LengthUCS4(const str: UCS4String): integer;
+begin
+ Result := High(str);
+ if (Result = -1) then
+ Result := 0;
+end;
+
+function UTF8CompareStr(const S1, S2: UTF8String): integer;
+begin
+ Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2));
+end;
+
+function UTF8CompareText(const S1, S2: UTF8String): integer;
+begin
+ Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2));
+end;
+
+function UTF8StartsStr(const SubText, Text: UTF8String): boolean;
+begin
+ // TODO: use WideSameStr (slower but handles different representations of the same char)?
+ Result := (Pos(SubText, Text) = 1);
+end;
+
+function UTF8StartsText(const SubText, Text: UTF8String): boolean;
+begin
+ // TODO: use WideSameText (slower but handles different representations of the same char)?
+ Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1);
+end;
+
+function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
+begin
+ Result := Pos(SubText, Text) > 0;
+end;
+
+function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
+begin
+ Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0;
+end;
+
+function UTF8UpperCase(const str: UTF8String): UTF8String;
+begin
+ Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str)));
+end;
+
+function UTF8LowerCase(const str: UTF8String): UTF8String;
+begin
+ Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str)));
+end;
+
+function UCS4UpperCase(ch: UCS4Char): UCS4Char;
+begin
+ Result := UCS4UpperCase(UCS4CharToString(ch))[0];
+end;
+
+function UCS4UpperCase(const str: UCS4String): UCS4String;
+begin
+ // convert to upper-case as WideString and convert result back to UCS-4
+ Result := WideStringToUCS4String(
+ WideStringUpperCase(
+ UCS4StringToWideString(str)));
+end;
+
+function UCS4CharToString(ch: UCS4Char): UCS4String;
+begin
+ SetLength(Result, 2);
+ Result[0] := ch;
+ Result[1] := 0;
+end;
+
+function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
+begin
+ Result := Pos(substr, str);
+end;
+
+function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String;
+begin
+ Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count));
+end;
+
+function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
+var
+ I: integer;
+ MaxCount: integer;
+begin
+ // calculate max. copy count
+ MaxCount := LengthUCS4(str)-Index;
+ if (MaxCount < 0) then
+ MaxCount := 0;
+ // adjust copy count
+ if (Count > MaxCount) or (Count < 0) then
+ Count := MaxCount;
+
+ // copy (and add zero terminator)
+ SetLength(Result, Count + 1);
+ for I := 0 to Count-1 do
+ Result[I] := str[Index+I];
+ Result[Count] := 0;
+end;
+
+procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
+var
+ StrUCS4: UCS4String;
+begin
+ StrUCS4 := UTF8ToUCS4String(str);
+ UCS4Delete(StrUCS4, Index-1, Count);
+ Str := UCS4ToUTF8String(StrUCS4);
+end;
+
+procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
+var
+ Len: integer;
+ OldStr: UCS4String;
+ I: integer;
+begin
+ Len := LengthUCS4(Str);
+ if (Count <= 0) or (Index < 0) or (Index >= Len) then
+ Exit;
+ if (Index + Count > Len) then
+ Count := Len-Index;
+
+ OldStr := Str;
+ SetLength(Str, Len-Count+1);
+ for I := 0 to Index-1 do
+ Str[I] := OldStr[I];
+ for I := Index+Count to Len-1 do
+ Str[I-Count] := OldStr[I];
+ Str[High(Str)] := 0;
+end;
+
+function WideStringUpperCase(ch: WideChar): WideString;
+begin
+ // If WideChar #0 is converted to a WideString in Delphi, a string with
+ // length 1 and a single char #0 is returned. In FPC an empty (length=0)
+ // string will be returned. This will crash, if a non printable key was
+ // pressed, its char code (#0) is translated to upper-case and the the first
+ // character is accessed with Result[1].
+ // We cannot catch this error in the WideString parameter variant as the string
+ // has length 0 already.
+
+ // Force min. string length of 1
+ if (ch = #0) then
+ Result := #0
+ else
+ Result := WideStringUpperCase(WideString(ch));
+end;
+
+function WideStringUpperCase(const str: WideString): WideString;
+begin
+ // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls.
+ // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()).
+ // The Unicode manager cwstring does not work with MacOSX at the moment because
+ // of missing references to iconv.
+ // Note: Should be fixed now
+
+ {.$IFNDEF DARWIN}
+ {.$IFDEF NOIGNORE}
+ Result := WideUpperCase(str)
+ {.$ELSE}
+ //Result := UTF8Decode(UpperCase(UTF8Encode(str)));
+ {.$ENDIF}
+end;
+
+function WideStringLowerCase(ch: WideChar): WideString;
+begin
+ // see WideStringUpperCase
+ if (ch = #0) then
+ Result := #0
+ else
+ Result := WideStringLowerCase(WideString(ch));
+end;
+
+function WideStringLowerCase(const str: WideString): WideString;
+begin
+ // see WideStringUpperCase
+ Result := WideLowerCase(str)
+end;
+
+function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
+var
+ iPos : integer;
+// sTemp : WideString;
+begin
+(*
+ result := text;
+ iPos := Pos(search, result);
+ while (iPos > 0) do
+ begin
+ sTemp := copy(result, iPos + length(search), length(result));
+ result := copy(result, 1, iPos - 1) + rep + sTEmp;
+ iPos := Pos(search, result);
+ end;
+*)
+ result := text;
+
+ if search = rep then
+ exit;
+
+ for iPos := 1 to length(result) do
+ begin
+ if result[iPos] = search then
+ result[iPos] := rep;
+ end;
+end;
+
+initialization
+ InitUnicodeUtils;
+
+end.