From 917901e8e33438c425aef50a0a7417f32d77b760 Mon Sep 17 00:00:00 2001 From: s_alexander Date: Mon, 9 Nov 2009 00:27:55 +0000 Subject: merged unicode branch (r1931) into trunk git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/trunk@1939 b956fd51-792f-4845-bead-9b4dfca2ff2c --- src/base/UUnicodeUtils.pas | 670 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 670 insertions(+) create mode 100644 src/base/UUnicodeUtils.pas (limited to 'src/base/UUnicodeUtils.pas') diff --git a/src/base/UUnicodeUtils.pas b/src/base/UUnicodeUtils.pas new file mode 100644 index 00000000..37b53a67 --- /dev/null +++ b/src/base/UUnicodeUtils.pas @@ -0,0 +1,670 @@ +{* UltraStar Deluxe - Karaoke Game + * + * UltraStar Deluxe is the legal property of its developers, whose names + * are too numerous to list here. Please refer to the COPYRIGHT + * file distributed with this source distribution. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * $URL$ + * $Id$ + *} + +unit UUnicodeUtils; + +interface + +{$IFDEF FPC} + {$MODE Delphi} +{$ENDIF} + +uses +{$IFDEF MSWINDOWS} + Windows, +{$ENDIF} + StrUtils, + SysUtils; + +type + // String with unknown encoding. Introduced with Delphi 2009 and maybe soon + // with FPC. + RawByteString = AnsiString; + +{** + * Returns true if the system uses UTF-8 as default string type + * (filesystem or API calls). + * This is always true on Mac OS X and always false on Win32. On Unix it depends + * on the LC_CTYPE setting. + * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true. + *} +function IsNativeUTF8(): boolean; + +(* + * Character classes + *) + +function IsAlphaChar(ch: WideChar): boolean; overload; +function IsAlphaChar(ch: UCS4Char): boolean; overload; + +function IsNumericChar(ch: WideChar): boolean; overload; +function IsNumericChar(ch: UCS4Char): boolean; overload; + +function IsAlphaNumericChar(ch: WideChar): boolean; overload; +function IsAlphaNumericChar(ch: UCS4Char): boolean; overload; + +function IsPunctuationChar(ch: WideChar): boolean; overload; +function IsPunctuationChar(ch: UCS4Char): boolean; overload; + +function IsControlChar(ch: WideChar): boolean; overload; +function IsControlChar(ch: UCS4Char): boolean; overload; + +function IsPrintableChar(ch: WideChar): boolean; overload; +function IsPrintableChar(ch: UCS4Char): boolean; overload; + +{** + * Checks if the given string is a valid UTF-8 string. + * If an ANSI encoded string (with char codes >= 128) is passed, the + * function will most probably return false, as most ANSI strings sequences + * are illegal in UTF-8. + *} +function IsUTF8String(const str: RawByteString): boolean; + +{** + * Iterates over an UTF-8 encoded string. + * StrPtr will be increased to the beginning of the next character on each + * call. + * Results true if the given string starts with an UTF-8 encoded char. + *} +function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; + +{** + * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index. + * Index values start with 1. + *} +procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); +procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); + +{** + * Checks if the string is composed of ASCII characters. + *} +function IsASCIIString(const str: RawByteString): boolean; + +{* + * String format conversion + *} + +function UTF8ToUCS4String(const str: UTF8String): UCS4String; +function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload; +function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload; + +{** + * Returns the number of characters (not bytes) in string str. + *} +function LengthUTF8(const str: UTF8String): integer; + +{** + * Returns the length of an UCS4String. Note that Length(UCS4String) returns + * the length+1 as UCS4Strings are zero-terminated. + *} +function LengthUCS4(const str: UCS4String): integer; + +{** @seealso WideCompareStr *} +function UTF8CompareStr(const S1, S2: UTF8String): integer; +{** @seealso WideCompareText *} +function UTF8CompareText(const S1, S2: UTF8String): integer; + +function UTF8StartsText(const SubText, Text: UTF8String): boolean; + +function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; +function UTF8ContainsText(const Text, SubText: UTF8String): boolean; + +{** @seealso WideUpperCase *} +function UTF8UpperCase(const str: UTF8String): UTF8String; +{** @seealso WideCompareText *} +function UTF8LowerCase(const str: UTF8String): UTF8String; + +{** + * Converts a UCS-4 char ch to its upper-case representation. + *} +function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload; + +{** + * Converts a UCS-4 string str to its upper-case representation. + *} +function UCS4UpperCase(const str: UCS4String): UCS4String; overload; + +{** + * Converts a UCS4Char to an UCS4String. + * Note that UCS4Strings are zero-terminated dynamic arrays. + *} +function UCS4CharToString(ch: UCS4Char): UCS4String; + +{** + * @seealso System.Pos() + *} +function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; + +{** + * Copies a segment of str starting with Index (1-based) with Count characters (not bytes). + *} +function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String; + +{** + * Copies a segment of str starting with Index (0-based) with Count characters. + * Note: Do not use Copy() to copy UCS4Strings as the result will not contain + * a trailing #0 character and hence is invalid. + *} +function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String; + +(* + * Converts a WideString to its upper- or lower-case representation. + * Wrapper for WideUpper/LowerCase. Needed because some plattforms have + * problems with unicode support. + * + * Note that characters in UTF-16 might consist of one or two WideChar valus + * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single + * character access, convert to UCS-4 where each character is represented by + * one UCS4Char. + *) +function WideStringUpperCase(const str: WideString) : WideString; overload; +function WideStringUpperCase(ch: WideChar): WideString; overload; +function WideStringLowerCase(const str: WideString): WideString; overload; +function WideStringLowerCase(ch: WideChar): WideString; overload; + +function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; + +implementation + +{$IFDEF UNIX} +{$IFNDEF DARWIN} +const + LC_CTYPE = 0; + +function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c'; +{$ENDIF} +{$ENDIF} + +var + NativeUTF8: boolean; + +procedure InitUnicodeUtils(); +{$IFDEF UNIX} +{$IFNDEF DARWIN} +var + localeName: PChar; +{$ENDIF} +{$ENDIF} +begin + {$IF Defined(DARWIN)} + NativeUTF8 := true; + {$ELSEIF Defined(MSWindows)} + NativeUTF8 := false; + {$ELSEIF Defined(UNIX)} + // check if locale name contains UTF8 or UTF-8 + localeName := setlocale(LC_CTYPE, nil); + NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0; + {$ELSE} + raise Exception.Create('Unknown system'); + {$IFEND} +end; + +function IsNativeUTF8(): boolean; +begin + Result := NativeUTF8; +end; + +function IsAlphaChar(ch: WideChar): boolean; +begin + {$IFDEF MSWINDOWS} + Result := IsCharAlphaW(ch); + {$ELSE} + // TODO: add chars > 255 (or replace with libxml2 functions?) + case ch of + 'A'..'Z', // A-Z + 'a'..'z', // a-z + #170,#181,#186, + #192..#214, + #216..#246, + #248..#255: + Result := true; + else + Result := false; + end; + {$ENDIF} +end; + +function IsAlphaChar(ch: UCS4Char): boolean; +begin + Result := IsAlphaChar(WideChar(Ord(ch))); +end; + +function IsNumericChar(ch: WideChar): boolean; +begin + // TODO: replace with libxml2 functions? + // ignore non-arabic numerals as we do not want to handle them + case ch of + '0'..'9': + Result := true; + else + Result := false; + end; +end; + +function IsNumericChar(ch: UCS4Char): boolean; +begin + Result := IsNumericChar(WideChar(Ord(ch))); +end; + +function IsAlphaNumericChar(ch: WideChar): boolean; +begin + Result := (IsAlphaChar(ch) or IsNumericChar(ch)); +end; + +function IsAlphaNumericChar(ch: UCS4Char): boolean; +begin + Result := (IsAlphaChar(ch) or IsNumericChar(ch)); +end; + +function IsPunctuationChar(ch: WideChar): boolean; +begin + // TODO: add chars > 255 (or replace with libxml2 functions?) + case ch of + ' '..'/',':'..'@','['..'`','{'..'~', + #160..#191,#215,#247: + Result := true; + else + Result := false; + end; +end; + +function IsPunctuationChar(ch: UCS4Char): boolean; +begin + Result := IsPunctuationChar(WideChar(Ord(ch))); +end; + +function IsControlChar(ch: WideChar): boolean; +begin + case ch of + #0..#31, + #127..#159: + Result := true; + else + Result := false; + end; +end; + +function IsControlChar(ch: UCS4Char): boolean; +begin + Result := IsControlChar(WideChar(Ord(ch))); +end; + +function IsPrintableChar(ch: WideChar): boolean; +begin + Result := not IsControlChar(ch); +end; + +function IsPrintableChar(ch: UCS4Char): boolean; +begin + Result := IsPrintableChar(WideChar(Ord(ch))); +end; + + +function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; + + // find the most significant zero bit (Result: [7..-1]) + function FindZeroMSB(b: byte): integer; + var + Mask: byte; + begin + Mask := $80; + Result := 7; + while (b and Mask <> 0) do + begin + Mask := Mask shr 1; + Dec(Result); + end; + end; + +var + ZeroBit: integer; + SeqCount: integer; // number of trailing bytes to follow +const + Mask: array[1..3] of byte = ($1F, $0F, $07); +begin + Result := false; + SeqCount := 0; + Ch := 0; + + while (StrPtr^ <> #0) do + begin + if (StrPtr^ < #128) then + begin + // check that no more trailing bytes are expected + if (SeqCount = 0) then + begin + Ch := Ord(StrPtr^); + Inc(StrPtr); + Result := true; + end; + Break; + end + else + begin + ZeroBit := FindZeroMSB(Ord(StrPtr^)); + // trailing byte expected + if (SeqCount > 0) then + begin + // check if trailing byte has pattern 10xxxxxx + if (ZeroBit <> 6) then + begin + Inc(StrPtr); + Break; + end; + + Dec(SeqCount); + Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F); + + // check if char is finished + if (SeqCount = 0) then + begin + Inc(StrPtr); + Result := true; + Break; + end; + end + else // leading byte expected + begin + // check if pattern is one of 110xxxxx/1110xxxx/11110xxx + if (ZeroBit > 5) or (ZeroBit < 3) then + begin + Inc(StrPtr); + Break; + end; + // calculate number of trailing bytes (1, 2 or 3) + SeqCount := 6 - ZeroBit; + // extract first part of char + Ch := Ord(StrPtr^) and Mask[SeqCount]; + end; + end; + + Inc(StrPtr); + end; + + if (not Result) then + Ch := Ord('?'); +end; + +function IsUTF8String(const str: RawByteString): boolean; +var + Ch: UCS4Char; + StrPtr: PAnsiChar; +begin + Result := true; + StrPtr := PChar(str); + while (StrPtr^ <> #0) do + begin + if (not NextCharUTF8(StrPtr, Ch)) then + begin + Result := false; + Exit; + end; + end; +end; + +function IsASCIIString(const str: RawByteString): boolean; +var + I: integer; +begin + for I := 1 to Length(str) do + begin + if (str[I] >= #128) then + begin + Result := false; + Exit; + end; + end; + Result := true; +end; + + +function UTF8ToUCS4String(const str: UTF8String): UCS4String; +begin + Result := WideStringToUCS4String(UTF8Decode(str)); +end; + +function UCS4ToUTF8String(const str: UCS4String): UTF8String; +begin + Result := UTF8Encode(UCS4StringToWideString(str)); +end; + +function UCS4ToUTF8String(ch: UCS4Char): UTF8String; +begin + Result := UCS4ToUTF8String(UCS4CharToString(ch)); +end; + +function LengthUTF8(const str: UTF8String): integer; +begin + Result := LengthUCS4(UTF8ToUCS4String(str)); +end; + +function LengthUCS4(const str: UCS4String): integer; +begin + Result := High(str); + if (Result = -1) then + Result := 0; +end; + +function UTF8CompareStr(const S1, S2: UTF8String): integer; +begin + Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2)); +end; + +function UTF8CompareText(const S1, S2: UTF8String): integer; +begin + Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2)); +end; + +function UTF8StartsStr(const SubText, Text: UTF8String): boolean; +begin + // TODO: use WideSameStr (slower but handles different representations of the same char)? + Result := (Pos(SubText, Text) = 1); +end; + +function UTF8StartsText(const SubText, Text: UTF8String): boolean; +begin + // TODO: use WideSameText (slower but handles different representations of the same char)? + Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1); +end; + +function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; +begin + Result := Pos(SubText, Text) > 0; +end; + +function UTF8ContainsText(const Text, SubText: UTF8String): boolean; +begin + Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0; +end; + +function UTF8UpperCase(const str: UTF8String): UTF8String; +begin + Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str))); +end; + +function UTF8LowerCase(const str: UTF8String): UTF8String; +begin + Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str))); +end; + +function UCS4UpperCase(ch: UCS4Char): UCS4Char; +begin + Result := UCS4UpperCase(UCS4CharToString(ch))[0]; +end; + +function UCS4UpperCase(const str: UCS4String): UCS4String; +begin + // convert to upper-case as WideString and convert result back to UCS-4 + Result := WideStringToUCS4String( + WideStringUpperCase( + UCS4StringToWideString(str))); +end; + +function UCS4CharToString(ch: UCS4Char): UCS4String; +begin + SetLength(Result, 2); + Result[0] := ch; + Result[1] := 0; +end; + +function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; +begin + Result := Pos(substr, str); +end; + +function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String; +begin + Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count)); +end; + +function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String; +var + I: integer; + MaxCount: integer; +begin + // calculate max. copy count + MaxCount := LengthUCS4(str)-Index; + if (MaxCount < 0) then + MaxCount := 0; + // adjust copy count + if (Count > MaxCount) or (Count < 0) then + Count := MaxCount; + + // copy (and add zero terminator) + SetLength(Result, Count + 1); + for I := 0 to Count-1 do + Result[I] := str[Index+I]; + Result[Count] := 0; +end; + +procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); +var + StrUCS4: UCS4String; +begin + StrUCS4 := UTF8ToUCS4String(str); + UCS4Delete(StrUCS4, Index-1, Count); + Str := UCS4ToUTF8String(StrUCS4); +end; + +procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); +var + Len: integer; + OldStr: UCS4String; + I: integer; +begin + Len := LengthUCS4(Str); + if (Count <= 0) or (Index < 0) or (Index >= Len) then + Exit; + if (Index + Count > Len) then + Count := Len-Index; + + OldStr := Str; + SetLength(Str, Len-Count+1); + for I := 0 to Index-1 do + Str[I] := OldStr[I]; + for I := Index+Count to Len-1 do + Str[I-Count] := OldStr[I]; + Str[High(Str)] := 0; +end; + +function WideStringUpperCase(ch: WideChar): WideString; +begin + // If WideChar #0 is converted to a WideString in Delphi, a string with + // length 1 and a single char #0 is returned. In FPC an empty (length=0) + // string will be returned. This will crash, if a non printable key was + // pressed, its char code (#0) is translated to upper-case and the the first + // character is accessed with Result[1]. + // We cannot catch this error in the WideString parameter variant as the string + // has length 0 already. + + // Force min. string length of 1 + if (ch = #0) then + Result := #0 + else + Result := WideStringUpperCase(WideString(ch)); +end; + +function WideStringUpperCase(const str: WideString): WideString; +begin + // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls. + // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()). + // The Unicode manager cwstring does not work with MacOSX at the moment because + // of missing references to iconv. + // Note: Should be fixed now + + {.$IFNDEF DARWIN} + {.$IFDEF NOIGNORE} + Result := WideUpperCase(str) + {.$ELSE} + //Result := UTF8Decode(UpperCase(UTF8Encode(str))); + {.$ENDIF} +end; + +function WideStringLowerCase(ch: WideChar): WideString; +begin + // see WideStringUpperCase + if (ch = #0) then + Result := #0 + else + Result := WideStringLowerCase(WideString(ch)); +end; + +function WideStringLowerCase(const str: WideString): WideString; +begin + // see WideStringUpperCase + Result := WideLowerCase(str) +end; + +function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; +var + iPos : integer; +// sTemp : WideString; +begin +(* + result := text; + iPos := Pos(search, result); + while (iPos > 0) do + begin + sTemp := copy(result, iPos + length(search), length(result)); + result := copy(result, 1, iPos - 1) + rep + sTEmp; + iPos := Pos(search, result); + end; +*) + result := text; + + if search = rep then + exit; + + for iPos := 1 to length(result) do + begin + if result[iPos] = search then + result[iPos] := rep; + end; +end; + +initialization + InitUnicodeUtils; + +end. -- cgit v1.2.3