{* UltraStar Deluxe - Karaoke Game * * UltraStar Deluxe is the legal property of its developers, whose names * are too numerous to list here. Please refer to the COPYRIGHT * file distributed with this source distribution. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * $URL$ * $Id$ *} unit UUnicodeUtils; interface {$IFDEF FPC} {$MODE Delphi} {$ENDIF} {$I switches.inc} uses {$IFDEF MSWINDOWS} Windows, {$ENDIF} SysUtils; (* * Character classes *) function IsAlphaChar(ch: WideChar): boolean; overload; function IsAlphaChar(ch: UCS4Char): boolean; overload; function IsNumericChar(ch: WideChar): boolean; overload; function IsNumericChar(ch: UCS4Char): boolean; overload; function IsAlphaNumericChar(ch: WideChar): boolean; overload; function IsAlphaNumericChar(ch: UCS4Char): boolean; overload; function IsPunctuationChar(ch: WideChar): boolean; overload; function IsPunctuationChar(ch: UCS4Char): boolean; overload; function IsControlChar(ch: WideChar): boolean; overload; function IsControlChar(ch: UCS4Char): boolean; overload; {** * Checks if the given string is a valid UTF-8 string. * If an ANSI encoded string (with char codes >= 128) is passed, the * function will most probably return false, as most ANSI strings sequences * are illegal in UTF-8. *} function IsUTF8String(const str: AnsiString): boolean; {** * Checks if the string is composed of ASCII characters. *} function IsASCIIString(const str: AnsiString): boolean; {* * String format conversion *} function UTF8ToUCS4String(const str: UTF8String): UCS4String; function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload; function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload; {** * Returns the number of characters (not bytes) in string str. *} function LengthUTF8(const str: UTF8String): integer; {** * Returns the length of an UCS4String. Note that Length(UCS4String) returns * the length+1 as UCS4Strings are zero-terminated. *} function LengthUCS4(const str: UCS4String): integer; function UTF8CompareStr(const S1, S2: UTF8String): integer; function UTF8CompareText(const S1, S2: UTF8String): integer; function UTF8StartsText(const SubText, Text: UTF8String): boolean; function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; function UTF8ContainsText(const Text, SubText: UTF8String): boolean; function UTF8UpperCase(const str: UTF8String): UTF8String; function UTF8LowerCase(const str: UTF8String): UTF8String; {** * Converts a UCS-4 char ch to its upper-case representation. *} function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload; {** * Converts a UCS-4 string str to its upper-case representation. *} function UCS4UpperCase(const str: UCS4String): UCS4String; overload; {** * Converts a UCS4Char to an UCS4String. * Note that UCS4Strings are zero-terminated dynamic arrays. *} function UCS4CharToString(ch: UCS4Char): UCS4String; {** * Copies a segment of str starting with Index with Count characters. * Note: Do not use Copy() to copy UCS4Strings as the result will not contain * a trailing #0 character and hence is invalid. *} function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String; (* * Converts a WideString to its upper-case representation. * Wrapper for WideUpperCase. Needed because some plattforms have problems with * unicode support. * * Note that characters in UTF-16 might consist of one or two WideChar valus * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single * character access, convert to UCS-4 where each character is represented by * one UCS4Char. *) function WideStringUpperCase(const str: WideString) : WideString; overload; function WideStringUpperCase(ch: WideChar): WideString; overload; function StringReplaceW(const text : WideString; search, rep: WideChar): WideString; implementation function IsAlphaChar(ch: WideChar): boolean; begin {$IFDEF MSWINDOWS} Result := IsCharAlphaW(ch); {$ELSE} // TODO: add chars > 255 case ch of 'A'..'Z', // A-Z 'a'..'z', // a-z #170,#181,#186, #192..#214, #216..#246, #248..#255: Result := true; else Result := false; end; {$ENDIF} end; function IsAlphaChar(ch: UCS4Char): boolean; begin Result := IsAlphaChar(WideChar(Ord(ch))); end; function IsNumericChar(ch: WideChar): boolean; begin // ignore non-arabic numerals as we do not want to handle them case ch of '0'..'9': Result := true; else Result := false; end; end; function IsNumericChar(ch: UCS4Char): boolean; begin Result := IsNumericChar(WideChar(Ord(ch))); end; function IsAlphaNumericChar(ch: WideChar): boolean; begin Result := (IsAlphaChar(ch) or IsNumericChar(ch)); end; function IsAlphaNumericChar(ch: UCS4Char): boolean; begin Result := (IsAlphaChar(ch) or IsNumericChar(ch)); end; function IsPunctuationChar(ch: WideChar): boolean; begin // TODO: add chars > 255? case ch of ' '..'/',':'..'@','['..'`','{'..'~', #160..#191,#215,#247: Result := true; else Result := false; end; end; function IsPunctuationChar(ch: UCS4Char): boolean; begin Result := IsPunctuationChar(WideChar(Ord(ch))); end; function IsControlChar(ch: WideChar): boolean; begin case ch of #0..#31, #127..#159: Result := true; else Result := false; end; end; function IsControlChar(ch: UCS4Char): boolean; begin Result := IsControlChar(WideChar(Ord(ch))); end; function IsUTF8String(const str: AnsiString): boolean; // find the most significant zero bit (Result: [7..-1]) function FindZeroMSB(b: byte): integer; var Mask: byte; begin Mask := $80; Result := 7; while (b and Mask <> 0) do begin Mask := Mask shr 1; Dec(Result); end; end; var I: integer; ZeroBit: integer; SeqCount: integer; // number of trailing bytes to follow begin Result := false; SeqCount := 0; for I := 1 to Length(str) do begin if (str[I] >= #128) then begin ZeroBit := FindZeroMSB(Ord(str[I])); // trailing byte expected if (SeqCount > 0) then begin // check if trailing byte has pattern 10xxxxxx if (ZeroBit <> 6) then Exit; Dec(SeqCount); end else // leading byte expected begin // check if pattern is one of 110xxxxx/1110xxxx/11110xxx if (ZeroBit > 5) or (ZeroBit < 3) then Exit; // calculate number of trailing bytes (1, 2 or 3) SeqCount := 6 - ZeroBit; end; end; end; // trailing bytes missing? if (SeqCount > 0) then Exit; Result := true; end; function IsASCIIString(const str: AnsiString): boolean; var I: integer; begin for I := 1 to Length(str) do begin if (str[I] >= #128) then begin Result := false; Exit; end; end; Result := true; end; function UTF8ToUCS4String(const str: UTF8String): UCS4String; begin Result := WideStringToUCS4String(UTF8Decode(str)); end; function UCS4ToUTF8String(const str: UCS4String): UTF8String; begin Result := UTF8Encode(UCS4StringToWideString(str)); end; function UCS4ToUTF8String(ch: UCS4Char): UTF8String; begin Result := UCS4ToUTF8String(UCS4CharToString(ch)); end; function LengthUTF8(const str: UTF8String): integer; begin Result := Length(UTF8ToUCS4String(str)); end; function LengthUCS4(const str: UCS4String): integer; begin Result := High(str); end; function UTF8CompareStr(const S1, S2: UTF8String): integer; begin // FIXME Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2)); end; function UTF8CompareText(const S1, S2: UTF8String): integer; begin // FIXME Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2)); end; function UTF8StartsStr(const SubText, Text: UTF8String): boolean; begin // TODO: use WideSameStr ()? Result := (Pos(SubText, Text) = 1); end; function UTF8StartsText(const SubText, Text: UTF8String): boolean; begin // TODO: use WideSameText? Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1); end; function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; begin Result := Pos(SubText, Text) > 0; end; function UTF8ContainsText(const Text, SubText: UTF8String): boolean; begin Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0; end; function UTF8UpperCase(const str: UTF8String): UTF8String; begin Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str))); end; function UTF8LowerCase(const str: UTF8String): UTF8String; begin // FIXME Result := UTF8Encode(WideLowerCase(UTF8Decode(str))); end; function UCS4UpperCase(ch: UCS4Char): UCS4Char; begin Result := UCS4UpperCase(UCS4CharToString(ch))[0]; end; function UCS4UpperCase(const str: UCS4String): UCS4String; begin // convert to upper-case as WideString and convert result back to UCS-4 Result := WideStringToUCS4String( WideStringUpperCase( UCS4StringToWideString(str))); end; function UCS4CharToString(ch: UCS4Char): UCS4String; begin SetLength(Result, 2); Result[0] := ch; Result[1] := 0; end; function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String; var I: integer; MaxCount: integer; begin // calculate max. copy count MaxCount := LengthUCS4(str)-Index; if (MaxCount < 0) then MaxCount := 0; // adjust copy count if (Count > MaxCount) or (Count < 0) then Count := MaxCount; // copy (and add zero terminator) SetLength(Result, Count + 1); for I := 0 to Count-1 do Result[I] := str[Index+I]; Result[Count] := 0; end; function WideStringUpperCase(ch: WideChar): WideString; begin // If WideChar #0 is converted to a WideString in Delphi, a string with // length 1 and a single char #0 is returned. In FPC an empty (length=0) // string will be returned. This will crash, if a non printable key was // pressed, its char code (#0) is translated to upper-case and the the first // character is accessed with Result[1]. // We cannot catch this error in the WideString parameter variant as the string // has length 0 already. // Force min. string length of 1 if (ch = #0) then Result := #0 else Result := WideStringUpperCase(WideString(ch)); end; function WideStringUpperCase(const str: WideString): WideString; begin // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls. // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()). // The Unicode manager cwstring does not work with MacOSX at the moment because // of missing references to iconv. {.$IFNDEF DARWIN} {$IFDEF NOIGNORE} Result := WideUpperCase(str) {$ELSE} Result := UTF8Decode(UpperCase(UTF8Encode(str))); {$ENDIF} end; function StringReplaceW(const text : WideString; search, rep: WideChar) : WideString; var iPos : integer; // sTemp : WideString; begin (* result := text; iPos := Pos(search, result); while (iPos > 0) do begin sTemp := copy(result, iPos + length(search), length(result)); result := copy(result, 1, iPos - 1) + rep + sTEmp; iPos := Pos(search, result); end; *) result := text; if search = rep then exit; for iPos := 1 to length(result) do begin if result[iPos] = search then result[iPos] := rep; end; end; end.