{* UltraStar Deluxe - Karaoke Game * * UltraStar Deluxe is the legal property of its developers, whose names * are too numerous to list here. Please refer to the COPYRIGHT * file distributed with this source distribution. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * $URL$ * $Id$ *} unit UUnicodeUtils; interface {$IFDEF FPC} {$MODE Delphi} {$ENDIF} uses {$IFDEF MSWINDOWS} Windows, {$ENDIF} StrUtils, SysUtils; type // String with unknown encoding. Introduced with Delphi 2009 and maybe soon // with FPC. RawByteString = AnsiString; {** * Returns true if the system uses UTF-8 as default string type * (filesystem or API calls). * This is always true on Mac OS X and always false on Win32. On Unix it depends * on the LC_CTYPE setting. * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true. *} function IsNativeUTF8(): boolean; (* * Character classes *) function IsAlphaChar(ch: WideChar): boolean; overload; function IsAlphaChar(ch: UCS4Char): boolean; overload; function IsNumericChar(ch: WideChar): boolean; overload; function IsNumericChar(ch: UCS4Char): boolean; overload; function IsAlphaNumericChar(ch: WideChar): boolean; overload; function IsAlphaNumericChar(ch: UCS4Char): boolean; overload; function IsPunctuationChar(ch: WideChar): boolean; overload; function IsPunctuationChar(ch: UCS4Char): boolean; overload; function IsControlChar(ch: WideChar): boolean; overload; function IsControlChar(ch: UCS4Char): boolean; overload; function IsPrintableChar(ch: WideChar): boolean; overload; function IsPrintableChar(ch: UCS4Char): boolean; overload; {** * Checks if the given string is a valid UTF-8 string. * If an ANSI encoded string (with char codes >= 128) is passed, the * function will most probably return false, as most ANSI strings sequences * are illegal in UTF-8. *} function IsUTF8String(const str: RawByteString): boolean; {** * Iterates over an UTF-8 encoded string. * StrPtr will be increased to the beginning of the next character on each * call. * Results true if the given string starts with an UTF-8 encoded char. *} function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; {** * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index. * Index values start with 1. *} procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); {** * Checks if the string is composed of ASCII characters. *} function IsASCIIString(const str: RawByteString): boolean; {* * String format conversion *} function UTF8ToUCS4String(const str: UTF8String): UCS4String; function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload; function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload; {** * Returns the number of characters (not bytes) in string str. *} function LengthUTF8(const str: UTF8String): integer; {** * Returns the length of an UCS4String. Note that Length(UCS4String) returns * the length+1 as UCS4Strings are zero-terminated. *} function LengthUCS4(const str: UCS4String): integer; {** @seealso WideCompareStr *} function UTF8CompareStr(const S1, S2: UTF8String): integer; {** @seealso WideCompareText *} function UTF8CompareText(const S1, S2: UTF8String): integer; function UTF8StartsText(const SubText, Text: UTF8String): boolean; function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; function UTF8ContainsText(const Text, SubText: UTF8String): boolean; {** @seealso WideUpperCase *} function UTF8UpperCase(const str: UTF8String): UTF8String; {** @seealso WideCompareText *} function UTF8LowerCase(const str: UTF8String): UTF8String; {** * Converts a UCS-4 char ch to its upper-case representation. *} function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload; {** * Converts a UCS-4 string str to its upper-case representation. *} function UCS4UpperCase(const str: UCS4String): UCS4String; overload; {** * Converts a UCS4Char to an UCS4String. * Note that UCS4Strings are zero-terminated dynamic arrays. *} function UCS4CharToString(ch: UCS4Char): UCS4String; {** * @seealso System.Pos() *} function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; {** * Copies a segment of str starting with Index (1-based) with Count characters (not bytes). *} function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String; {** * Copies a segment of str starting with Index (0-based) with Count characters. * Note: Do not use Copy() to copy UCS4Strings as the result will not contain * a trailing #0 character and hence is invalid. *} function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String; (* * Converts a WideString to its upper- or lower-case representation. * Wrapper for WideUpper/LowerCase. Needed because some plattforms have * problems with unicode support. * * Note that characters in UTF-16 might consist of one or two WideChar valus * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single * character access, convert to UCS-4 where each character is represented by * one UCS4Char. *) function WideStringUpperCase(const str: WideString) : WideString; overload; function WideStringUpperCase(ch: WideChar): WideString; overload; function WideStringLowerCase(const str: WideString): WideString; overload; function WideStringLowerCase(ch: WideChar): WideString; overload; function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; implementation {$IFDEF UNIX} {$IFNDEF DARWIN} const LC_CTYPE = 0; function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c'; {$ENDIF} {$ENDIF} var NativeUTF8: boolean; procedure InitUnicodeUtils(); {$IFDEF UNIX} {$IFNDEF DARWIN} var localeName: PChar; {$ENDIF} {$ENDIF} begin {$IF Defined(DARWIN)} NativeUTF8 := true; {$ELSEIF Defined(MSWindows)} NativeUTF8 := false; {$ELSEIF Defined(UNIX)} // check if locale name contains UTF8 or UTF-8 localeName := setlocale(LC_CTYPE, nil); NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0; {$ELSE} raise Exception.Create('Unknown system'); {$IFEND} end; function IsNativeUTF8(): boolean; begin Result := NativeUTF8; end; function IsAlphaChar(ch: WideChar): boolean; begin {$IFDEF MSWINDOWS} Result := IsCharAlphaW(ch); {$ELSE} // TODO: add chars > 255 (or replace with libxml2 functions?) case ch of 'A'..'Z', // A-Z 'a'..'z', // a-z #170,#181,#186, #192..#214, #216..#246, #248..#255: Result := true; else Result := false; end; {$ENDIF} end; function IsAlphaChar(ch: UCS4Char): boolean; begin Result := IsAlphaChar(WideChar(Ord(ch))); end; function IsNumericChar(ch: WideChar): boolean; begin // TODO: replace with libxml2 functions? // ignore non-arabic numerals as we do not want to handle them case ch of '0'..'9': Result := true; else Result := false; end; end; function IsNumericChar(ch: UCS4Char): boolean; begin Result := IsNumericChar(WideChar(Ord(ch))); end; function IsAlphaNumericChar(ch: WideChar): boolean; begin Result := (IsAlphaChar(ch) or IsNumericChar(ch)); end; function IsAlphaNumericChar(ch: UCS4Char): boolean; begin Result := (IsAlphaChar(ch) or IsNumericChar(ch)); end; function IsPunctuationChar(ch: WideChar): boolean; begin // TODO: add chars > 255 (or replace with libxml2 functions?) case ch of ' '..'/',':'..'@','['..'`','{'..'~', #160..#191,#215,#247: Result := true; else Result := false; end; end; function IsPunctuationChar(ch: UCS4Char): boolean; begin Result := IsPunctuationChar(WideChar(Ord(ch))); end; function IsControlChar(ch: WideChar): boolean; begin case ch of #0..#31, #127..#159: Result := true; else Result := false; end; end; function IsControlChar(ch: UCS4Char): boolean; begin Result := IsControlChar(WideChar(Ord(ch))); end; function IsPrintableChar(ch: WideChar): boolean; begin Result := not IsControlChar(ch); end; function IsPrintableChar(ch: UCS4Char): boolean; begin Result := IsPrintableChar(WideChar(Ord(ch))); end; function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; // find the most significant zero bit (Result: [7..-1]) function FindZeroMSB(b: byte): integer; var Mask: byte; begin Mask := $80; Result := 7; while (b and Mask <> 0) do begin Mask := Mask shr 1; Dec(Result); end; end; var ZeroBit: integer; SeqCount: integer; // number of trailing bytes to follow const Mask: array[1..3] of byte = ($1F, $0F, $07); begin Result := false; SeqCount := 0; Ch := 0; while (StrPtr^ <> #0) do begin if (StrPtr^ < #128) then begin // check that no more trailing bytes are expected if (SeqCount = 0) then begin Ch := Ord(StrPtr^); Inc(StrPtr); Result := true; end; Break; end else begin ZeroBit := FindZeroMSB(Ord(StrPtr^)); // trailing byte expected if (SeqCount > 0) then begin // check if trailing byte has pattern 10xxxxxx if (ZeroBit <> 6) then begin Inc(StrPtr); Break; end; Dec(SeqCount); Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F); // check if char is finished if (SeqCount = 0) then begin Inc(StrPtr); Result := true; Break; end; end else // leading byte expected begin // check if pattern is one of 110xxxxx/1110xxxx/11110xxx if (ZeroBit > 5) or (ZeroBit < 3) then begin Inc(StrPtr); Break; end; // calculate number of trailing bytes (1, 2 or 3) SeqCount := 6 - ZeroBit; // extract first part of char Ch := Ord(StrPtr^) and Mask[SeqCount]; end; end; Inc(StrPtr); end; if (not Result) then Ch := Ord('?'); end; function IsUTF8String(const str: RawByteString): boolean; var Ch: UCS4Char; StrPtr: PAnsiChar; begin Result := true; StrPtr := PChar(str); while (StrPtr^ <> #0) do begin if (not NextCharUTF8(StrPtr, Ch)) then begin Result := false; Exit; end; end; end; function IsASCIIString(const str: RawByteString): boolean; var I: integer; begin for I := 1 to Length(str) do begin if (str[I] >= #128) then begin Result := false; Exit; end; end; Result := true; end; function UTF8ToUCS4String(const str: UTF8String): UCS4String; begin Result := WideStringToUCS4String(UTF8Decode(str)); end; function UCS4ToUTF8String(const str: UCS4String): UTF8String; begin Result := UTF8Encode(UCS4StringToWideString(str)); end; function UCS4ToUTF8String(ch: UCS4Char): UTF8String; begin Result := UCS4ToUTF8String(UCS4CharToString(ch)); end; function LengthUTF8(const str: UTF8String): integer; begin Result := LengthUCS4(UTF8ToUCS4String(str)); end; function LengthUCS4(const str: UCS4String): integer; begin Result := High(str); if (Result = -1) then Result := 0; end; function UTF8CompareStr(const S1, S2: UTF8String): integer; begin Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2)); end; function UTF8CompareText(const S1, S2: UTF8String): integer; begin Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2)); end; function UTF8StartsStr(const SubText, Text: UTF8String): boolean; begin // TODO: use WideSameStr (slower but handles different representations of the same char)? Result := (Pos(SubText, Text) = 1); end; function UTF8StartsText(const SubText, Text: UTF8String): boolean; begin // TODO: use WideSameText (slower but handles different representations of the same char)? Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1); end; function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; begin Result := Pos(SubText, Text) > 0; end; function UTF8ContainsText(const Text, SubText: UTF8String): boolean; begin Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0; end; function UTF8UpperCase(const str: UTF8String): UTF8String; begin Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str))); end; function UTF8LowerCase(const str: UTF8String): UTF8String; begin Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str))); end; function UCS4UpperCase(ch: UCS4Char): UCS4Char; begin Result := UCS4UpperCase(UCS4CharToString(ch))[0]; end; function UCS4UpperCase(const str: UCS4String): UCS4String; begin // convert to upper-case as WideString and convert result back to UCS-4 Result := WideStringToUCS4String( WideStringUpperCase( UCS4StringToWideString(str))); end; function UCS4CharToString(ch: UCS4Char): UCS4String; begin SetLength(Result, 2); Result[0] := ch; Result[1] := 0; end; function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; begin Result := Pos(substr, str); end; function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String; begin Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count)); end; function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String; var I: integer; MaxCount: integer; begin // calculate max. copy count MaxCount := LengthUCS4(str)-Index; if (MaxCount < 0) then MaxCount := 0; // adjust copy count if (Count > MaxCount) or (Count < 0) then Count := MaxCount; // copy (and add zero terminator) SetLength(Result, Count + 1); for I := 0 to Count-1 do Result[I] := str[Index+I]; Result[Count] := 0; end; procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); var StrUCS4: UCS4String; begin StrUCS4 := UTF8ToUCS4String(str); UCS4Delete(StrUCS4, Index-1, Count); Str := UCS4ToUTF8String(StrUCS4); end; procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); var Len: integer; OldStr: UCS4String; I: integer; begin Len := LengthUCS4(Str); if (Count <= 0) or (Index < 0) or (Index >= Len) then Exit; if (Index + Count > Len) then Count := Len-Index; OldStr := Str; SetLength(Str, Len-Count+1); for I := 0 to Index-1 do Str[I] := OldStr[I]; for I := Index+Count to Len-1 do Str[I-Count] := OldStr[I]; Str[High(Str)] := 0; end; function WideStringUpperCase(ch: WideChar): WideString; begin // If WideChar #0 is converted to a WideString in Delphi, a string with // length 1 and a single char #0 is returned. In FPC an empty (length=0) // string will be returned. This will crash, if a non printable key was // pressed, its char code (#0) is translated to upper-case and the the first // character is accessed with Result[1]. // We cannot catch this error in the WideString parameter variant as the string // has length 0 already. // Force min. string length of 1 if (ch = #0) then Result := #0 else Result := WideStringUpperCase(WideString(ch)); end; function WideStringUpperCase(const str: WideString): WideString; begin // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls. // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()). // The Unicode manager cwstring does not work with MacOSX at the moment because // of missing references to iconv. // Note: Should be fixed now {.$IFNDEF DARWIN} {.$IFDEF NOIGNORE} Result := WideUpperCase(str) {.$ELSE} //Result := UTF8Decode(UpperCase(UTF8Encode(str))); {.$ENDIF} end; function WideStringLowerCase(ch: WideChar): WideString; begin // see WideStringUpperCase if (ch = #0) then Result := #0 else Result := WideStringLowerCase(WideString(ch)); end; function WideStringLowerCase(const str: WideString): WideString; begin // see WideStringUpperCase Result := WideLowerCase(str) end; function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; var iPos : integer; // sTemp : WideString; begin (* result := text; iPos := Pos(search, result); while (iPos > 0) do begin sTemp := copy(result, iPos + length(search), length(result)); result := copy(result, 1, iPos - 1) + rep + sTEmp; iPos := Pos(search, result); end; *) result := text; if search = rep then exit; for iPos := 1 to length(result) do begin if result[iPos] = search then result[iPos] := rep; end; end; initialization InitUnicodeUtils; end.