diff options
Diffstat (limited to 'src/base/UUnicodeUtils.pas')
-rw-r--r-- | src/base/UUnicodeUtils.pas | 670 |
1 files changed, 0 insertions, 670 deletions
diff --git a/src/base/UUnicodeUtils.pas b/src/base/UUnicodeUtils.pas deleted file mode 100644 index 37b53a67..00000000 --- a/src/base/UUnicodeUtils.pas +++ /dev/null @@ -1,670 +0,0 @@ -{* UltraStar Deluxe - Karaoke Game - * - * UltraStar Deluxe is the legal property of its developers, whose names - * are too numerous to list here. Please refer to the COPYRIGHT - * file distributed with this source distribution. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - * - * $URL$ - * $Id$ - *} - -unit UUnicodeUtils; - -interface - -{$IFDEF FPC} - {$MODE Delphi} -{$ENDIF} - -uses -{$IFDEF MSWINDOWS} - Windows, -{$ENDIF} - StrUtils, - SysUtils; - -type - // String with unknown encoding. Introduced with Delphi 2009 and maybe soon - // with FPC. - RawByteString = AnsiString; - -{** - * Returns true if the system uses UTF-8 as default string type - * (filesystem or API calls). - * This is always true on Mac OS X and always false on Win32. On Unix it depends - * on the LC_CTYPE setting. - * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true. - *} -function IsNativeUTF8(): boolean; - -(* - * Character classes - *) - -function IsAlphaChar(ch: WideChar): boolean; overload; -function IsAlphaChar(ch: UCS4Char): boolean; overload; - -function IsNumericChar(ch: WideChar): boolean; overload; -function IsNumericChar(ch: UCS4Char): boolean; overload; - -function IsAlphaNumericChar(ch: WideChar): boolean; overload; -function IsAlphaNumericChar(ch: UCS4Char): boolean; overload; - -function IsPunctuationChar(ch: WideChar): boolean; overload; -function IsPunctuationChar(ch: UCS4Char): boolean; overload; - -function IsControlChar(ch: WideChar): boolean; overload; -function IsControlChar(ch: UCS4Char): boolean; overload; - -function IsPrintableChar(ch: WideChar): boolean; overload; -function IsPrintableChar(ch: UCS4Char): boolean; overload; - -{** - * Checks if the given string is a valid UTF-8 string. - * If an ANSI encoded string (with char codes >= 128) is passed, the - * function will most probably return false, as most ANSI strings sequences - * are illegal in UTF-8. - *} -function IsUTF8String(const str: RawByteString): boolean; - -{** - * Iterates over an UTF-8 encoded string. - * StrPtr will be increased to the beginning of the next character on each - * call. - * Results true if the given string starts with an UTF-8 encoded char. - *} -function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; - -{** - * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index. - * Index values start with 1. - *} -procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); -procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); - -{** - * Checks if the string is composed of ASCII characters. - *} -function IsASCIIString(const str: RawByteString): boolean; - -{* - * String format conversion - *} - -function UTF8ToUCS4String(const str: UTF8String): UCS4String; -function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload; -function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload; - -{** - * Returns the number of characters (not bytes) in string str. - *} -function LengthUTF8(const str: UTF8String): integer; - -{** - * Returns the length of an UCS4String. Note that Length(UCS4String) returns - * the length+1 as UCS4Strings are zero-terminated. - *} -function LengthUCS4(const str: UCS4String): integer; - -{** @seealso WideCompareStr *} -function UTF8CompareStr(const S1, S2: UTF8String): integer; -{** @seealso WideCompareText *} -function UTF8CompareText(const S1, S2: UTF8String): integer; - -function UTF8StartsText(const SubText, Text: UTF8String): boolean; - -function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; -function UTF8ContainsText(const Text, SubText: UTF8String): boolean; - -{** @seealso WideUpperCase *} -function UTF8UpperCase(const str: UTF8String): UTF8String; -{** @seealso WideCompareText *} -function UTF8LowerCase(const str: UTF8String): UTF8String; - -{** - * Converts a UCS-4 char ch to its upper-case representation. - *} -function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload; - -{** - * Converts a UCS-4 string str to its upper-case representation. - *} -function UCS4UpperCase(const str: UCS4String): UCS4String; overload; - -{** - * Converts a UCS4Char to an UCS4String. - * Note that UCS4Strings are zero-terminated dynamic arrays. - *} -function UCS4CharToString(ch: UCS4Char): UCS4String; - -{** - * @seealso System.Pos() - *} -function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; - -{** - * Copies a segment of str starting with Index (1-based) with Count characters (not bytes). - *} -function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String; - -{** - * Copies a segment of str starting with Index (0-based) with Count characters. - * Note: Do not use Copy() to copy UCS4Strings as the result will not contain - * a trailing #0 character and hence is invalid. - *} -function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String; - -(* - * Converts a WideString to its upper- or lower-case representation. - * Wrapper for WideUpper/LowerCase. Needed because some plattforms have - * problems with unicode support. - * - * Note that characters in UTF-16 might consist of one or two WideChar valus - * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single - * character access, convert to UCS-4 where each character is represented by - * one UCS4Char. - *) -function WideStringUpperCase(const str: WideString) : WideString; overload; -function WideStringUpperCase(ch: WideChar): WideString; overload; -function WideStringLowerCase(const str: WideString): WideString; overload; -function WideStringLowerCase(ch: WideChar): WideString; overload; - -function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; - -implementation - -{$IFDEF UNIX} -{$IFNDEF DARWIN} -const - LC_CTYPE = 0; - -function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c'; -{$ENDIF} -{$ENDIF} - -var - NativeUTF8: boolean; - -procedure InitUnicodeUtils(); -{$IFDEF UNIX} -{$IFNDEF DARWIN} -var - localeName: PChar; -{$ENDIF} -{$ENDIF} -begin - {$IF Defined(DARWIN)} - NativeUTF8 := true; - {$ELSEIF Defined(MSWindows)} - NativeUTF8 := false; - {$ELSEIF Defined(UNIX)} - // check if locale name contains UTF8 or UTF-8 - localeName := setlocale(LC_CTYPE, nil); - NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0; - {$ELSE} - raise Exception.Create('Unknown system'); - {$IFEND} -end; - -function IsNativeUTF8(): boolean; -begin - Result := NativeUTF8; -end; - -function IsAlphaChar(ch: WideChar): boolean; -begin - {$IFDEF MSWINDOWS} - Result := IsCharAlphaW(ch); - {$ELSE} - // TODO: add chars > 255 (or replace with libxml2 functions?) - case ch of - 'A'..'Z', // A-Z - 'a'..'z', // a-z - #170,#181,#186, - #192..#214, - #216..#246, - #248..#255: - Result := true; - else - Result := false; - end; - {$ENDIF} -end; - -function IsAlphaChar(ch: UCS4Char): boolean; -begin - Result := IsAlphaChar(WideChar(Ord(ch))); -end; - -function IsNumericChar(ch: WideChar): boolean; -begin - // TODO: replace with libxml2 functions? - // ignore non-arabic numerals as we do not want to handle them - case ch of - '0'..'9': - Result := true; - else - Result := false; - end; -end; - -function IsNumericChar(ch: UCS4Char): boolean; -begin - Result := IsNumericChar(WideChar(Ord(ch))); -end; - -function IsAlphaNumericChar(ch: WideChar): boolean; -begin - Result := (IsAlphaChar(ch) or IsNumericChar(ch)); -end; - -function IsAlphaNumericChar(ch: UCS4Char): boolean; -begin - Result := (IsAlphaChar(ch) or IsNumericChar(ch)); -end; - -function IsPunctuationChar(ch: WideChar): boolean; -begin - // TODO: add chars > 255 (or replace with libxml2 functions?) - case ch of - ' '..'/',':'..'@','['..'`','{'..'~', - #160..#191,#215,#247: - Result := true; - else - Result := false; - end; -end; - -function IsPunctuationChar(ch: UCS4Char): boolean; -begin - Result := IsPunctuationChar(WideChar(Ord(ch))); -end; - -function IsControlChar(ch: WideChar): boolean; -begin - case ch of - #0..#31, - #127..#159: - Result := true; - else - Result := false; - end; -end; - -function IsControlChar(ch: UCS4Char): boolean; -begin - Result := IsControlChar(WideChar(Ord(ch))); -end; - -function IsPrintableChar(ch: WideChar): boolean; -begin - Result := not IsControlChar(ch); -end; - -function IsPrintableChar(ch: UCS4Char): boolean; -begin - Result := IsPrintableChar(WideChar(Ord(ch))); -end; - - -function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean; - - // find the most significant zero bit (Result: [7..-1]) - function FindZeroMSB(b: byte): integer; - var - Mask: byte; - begin - Mask := $80; - Result := 7; - while (b and Mask <> 0) do - begin - Mask := Mask shr 1; - Dec(Result); - end; - end; - -var - ZeroBit: integer; - SeqCount: integer; // number of trailing bytes to follow -const - Mask: array[1..3] of byte = ($1F, $0F, $07); -begin - Result := false; - SeqCount := 0; - Ch := 0; - - while (StrPtr^ <> #0) do - begin - if (StrPtr^ < #128) then - begin - // check that no more trailing bytes are expected - if (SeqCount = 0) then - begin - Ch := Ord(StrPtr^); - Inc(StrPtr); - Result := true; - end; - Break; - end - else - begin - ZeroBit := FindZeroMSB(Ord(StrPtr^)); - // trailing byte expected - if (SeqCount > 0) then - begin - // check if trailing byte has pattern 10xxxxxx - if (ZeroBit <> 6) then - begin - Inc(StrPtr); - Break; - end; - - Dec(SeqCount); - Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F); - - // check if char is finished - if (SeqCount = 0) then - begin - Inc(StrPtr); - Result := true; - Break; - end; - end - else // leading byte expected - begin - // check if pattern is one of 110xxxxx/1110xxxx/11110xxx - if (ZeroBit > 5) or (ZeroBit < 3) then - begin - Inc(StrPtr); - Break; - end; - // calculate number of trailing bytes (1, 2 or 3) - SeqCount := 6 - ZeroBit; - // extract first part of char - Ch := Ord(StrPtr^) and Mask[SeqCount]; - end; - end; - - Inc(StrPtr); - end; - - if (not Result) then - Ch := Ord('?'); -end; - -function IsUTF8String(const str: RawByteString): boolean; -var - Ch: UCS4Char; - StrPtr: PAnsiChar; -begin - Result := true; - StrPtr := PChar(str); - while (StrPtr^ <> #0) do - begin - if (not NextCharUTF8(StrPtr, Ch)) then - begin - Result := false; - Exit; - end; - end; -end; - -function IsASCIIString(const str: RawByteString): boolean; -var - I: integer; -begin - for I := 1 to Length(str) do - begin - if (str[I] >= #128) then - begin - Result := false; - Exit; - end; - end; - Result := true; -end; - - -function UTF8ToUCS4String(const str: UTF8String): UCS4String; -begin - Result := WideStringToUCS4String(UTF8Decode(str)); -end; - -function UCS4ToUTF8String(const str: UCS4String): UTF8String; -begin - Result := UTF8Encode(UCS4StringToWideString(str)); -end; - -function UCS4ToUTF8String(ch: UCS4Char): UTF8String; -begin - Result := UCS4ToUTF8String(UCS4CharToString(ch)); -end; - -function LengthUTF8(const str: UTF8String): integer; -begin - Result := LengthUCS4(UTF8ToUCS4String(str)); -end; - -function LengthUCS4(const str: UCS4String): integer; -begin - Result := High(str); - if (Result = -1) then - Result := 0; -end; - -function UTF8CompareStr(const S1, S2: UTF8String): integer; -begin - Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2)); -end; - -function UTF8CompareText(const S1, S2: UTF8String): integer; -begin - Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2)); -end; - -function UTF8StartsStr(const SubText, Text: UTF8String): boolean; -begin - // TODO: use WideSameStr (slower but handles different representations of the same char)? - Result := (Pos(SubText, Text) = 1); -end; - -function UTF8StartsText(const SubText, Text: UTF8String): boolean; -begin - // TODO: use WideSameText (slower but handles different representations of the same char)? - Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1); -end; - -function UTF8ContainsStr(const Text, SubText: UTF8String): boolean; -begin - Result := Pos(SubText, Text) > 0; -end; - -function UTF8ContainsText(const Text, SubText: UTF8String): boolean; -begin - Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0; -end; - -function UTF8UpperCase(const str: UTF8String): UTF8String; -begin - Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str))); -end; - -function UTF8LowerCase(const str: UTF8String): UTF8String; -begin - Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str))); -end; - -function UCS4UpperCase(ch: UCS4Char): UCS4Char; -begin - Result := UCS4UpperCase(UCS4CharToString(ch))[0]; -end; - -function UCS4UpperCase(const str: UCS4String): UCS4String; -begin - // convert to upper-case as WideString and convert result back to UCS-4 - Result := WideStringToUCS4String( - WideStringUpperCase( - UCS4StringToWideString(str))); -end; - -function UCS4CharToString(ch: UCS4Char): UCS4String; -begin - SetLength(Result, 2); - Result[0] := ch; - Result[1] := 0; -end; - -function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer; -begin - Result := Pos(substr, str); -end; - -function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String; -begin - Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count)); -end; - -function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String; -var - I: integer; - MaxCount: integer; -begin - // calculate max. copy count - MaxCount := LengthUCS4(str)-Index; - if (MaxCount < 0) then - MaxCount := 0; - // adjust copy count - if (Count > MaxCount) or (Count < 0) then - Count := MaxCount; - - // copy (and add zero terminator) - SetLength(Result, Count + 1); - for I := 0 to Count-1 do - Result[I] := str[Index+I]; - Result[Count] := 0; -end; - -procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer); -var - StrUCS4: UCS4String; -begin - StrUCS4 := UTF8ToUCS4String(str); - UCS4Delete(StrUCS4, Index-1, Count); - Str := UCS4ToUTF8String(StrUCS4); -end; - -procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer); -var - Len: integer; - OldStr: UCS4String; - I: integer; -begin - Len := LengthUCS4(Str); - if (Count <= 0) or (Index < 0) or (Index >= Len) then - Exit; - if (Index + Count > Len) then - Count := Len-Index; - - OldStr := Str; - SetLength(Str, Len-Count+1); - for I := 0 to Index-1 do - Str[I] := OldStr[I]; - for I := Index+Count to Len-1 do - Str[I-Count] := OldStr[I]; - Str[High(Str)] := 0; -end; - -function WideStringUpperCase(ch: WideChar): WideString; -begin - // If WideChar #0 is converted to a WideString in Delphi, a string with - // length 1 and a single char #0 is returned. In FPC an empty (length=0) - // string will be returned. This will crash, if a non printable key was - // pressed, its char code (#0) is translated to upper-case and the the first - // character is accessed with Result[1]. - // We cannot catch this error in the WideString parameter variant as the string - // has length 0 already. - - // Force min. string length of 1 - if (ch = #0) then - Result := #0 - else - Result := WideStringUpperCase(WideString(ch)); -end; - -function WideStringUpperCase(const str: WideString): WideString; -begin - // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls. - // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()). - // The Unicode manager cwstring does not work with MacOSX at the moment because - // of missing references to iconv. - // Note: Should be fixed now - - {.$IFNDEF DARWIN} - {.$IFDEF NOIGNORE} - Result := WideUpperCase(str) - {.$ELSE} - //Result := UTF8Decode(UpperCase(UTF8Encode(str))); - {.$ENDIF} -end; - -function WideStringLowerCase(ch: WideChar): WideString; -begin - // see WideStringUpperCase - if (ch = #0) then - Result := #0 - else - Result := WideStringLowerCase(WideString(ch)); -end; - -function WideStringLowerCase(const str: WideString): WideString; -begin - // see WideStringUpperCase - Result := WideLowerCase(str) -end; - -function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString; -var - iPos : integer; -// sTemp : WideString; -begin -(* - result := text; - iPos := Pos(search, result); - while (iPos > 0) do - begin - sTemp := copy(result, iPos + length(search), length(result)); - result := copy(result, 1, iPos - 1) + rep + sTEmp; - iPos := Pos(search, result); - end; -*) - result := text; - - if search = rep then - exit; - - for iPos := 1 to length(result) do - begin - if result[iPos] = search then - result[iPos] := rep; - end; -end; - -initialization - InitUnicodeUtils; - -end. |