aboutsummaryrefslogtreecommitdiffstats
path: root/src/base/UUnicodeUtils.pas
diff options
context:
space:
mode:
authorAlexander Sulfrian <alexander@sulfrian.net>2011-11-07 15:26:44 +0100
committerAlexander Sulfrian <alexander@sulfrian.net>2013-01-05 17:17:49 +0100
commit3260749d369d3466c345d40a8b2189c32c8c1b60 (patch)
treebdf235d333e6b4d0b0edb11bde421617a180ff92 /src/base/UUnicodeUtils.pas
parentde5a3593ae7bc6fb5aab9d76d01d3faa47b91bba (diff)
downloadusdx-3260749d369d3466c345d40a8b2189c32c8c1b60.tar.gz
usdx-3260749d369d3466c345d40a8b2189c32c8c1b60.tar.xz
usdx-3260749d369d3466c345d40a8b2189c32c8c1b60.zip
removed pascal code
Diffstat (limited to 'src/base/UUnicodeUtils.pas')
-rw-r--r--src/base/UUnicodeUtils.pas670
1 files changed, 0 insertions, 670 deletions
diff --git a/src/base/UUnicodeUtils.pas b/src/base/UUnicodeUtils.pas
deleted file mode 100644
index 37b53a67..00000000
--- a/src/base/UUnicodeUtils.pas
+++ /dev/null
@@ -1,670 +0,0 @@
-{* UltraStar Deluxe - Karaoke Game
- *
- * UltraStar Deluxe is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- *
- * $URL$
- * $Id$
- *}
-
-unit UUnicodeUtils;
-
-interface
-
-{$IFDEF FPC}
- {$MODE Delphi}
-{$ENDIF}
-
-uses
-{$IFDEF MSWINDOWS}
- Windows,
-{$ENDIF}
- StrUtils,
- SysUtils;
-
-type
- // String with unknown encoding. Introduced with Delphi 2009 and maybe soon
- // with FPC.
- RawByteString = AnsiString;
-
-{**
- * Returns true if the system uses UTF-8 as default string type
- * (filesystem or API calls).
- * This is always true on Mac OS X and always false on Win32. On Unix it depends
- * on the LC_CTYPE setting.
- * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true.
- *}
-function IsNativeUTF8(): boolean;
-
-(*
- * Character classes
- *)
-
-function IsAlphaChar(ch: WideChar): boolean; overload;
-function IsAlphaChar(ch: UCS4Char): boolean; overload;
-
-function IsNumericChar(ch: WideChar): boolean; overload;
-function IsNumericChar(ch: UCS4Char): boolean; overload;
-
-function IsAlphaNumericChar(ch: WideChar): boolean; overload;
-function IsAlphaNumericChar(ch: UCS4Char): boolean; overload;
-
-function IsPunctuationChar(ch: WideChar): boolean; overload;
-function IsPunctuationChar(ch: UCS4Char): boolean; overload;
-
-function IsControlChar(ch: WideChar): boolean; overload;
-function IsControlChar(ch: UCS4Char): boolean; overload;
-
-function IsPrintableChar(ch: WideChar): boolean; overload;
-function IsPrintableChar(ch: UCS4Char): boolean; overload;
-
-{**
- * Checks if the given string is a valid UTF-8 string.
- * If an ANSI encoded string (with char codes >= 128) is passed, the
- * function will most probably return false, as most ANSI strings sequences
- * are illegal in UTF-8.
- *}
-function IsUTF8String(const str: RawByteString): boolean;
-
-{**
- * Iterates over an UTF-8 encoded string.
- * StrPtr will be increased to the beginning of the next character on each
- * call.
- * Results true if the given string starts with an UTF-8 encoded char.
- *}
-function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;
-
-{**
- * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index.
- * Index values start with 1.
- *}
-procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
-procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
-
-{**
- * Checks if the string is composed of ASCII characters.
- *}
-function IsASCIIString(const str: RawByteString): boolean;
-
-{*
- * String format conversion
- *}
-
-function UTF8ToUCS4String(const str: UTF8String): UCS4String;
-function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload;
-function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;
-
-{**
- * Returns the number of characters (not bytes) in string str.
- *}
-function LengthUTF8(const str: UTF8String): integer;
-
-{**
- * Returns the length of an UCS4String. Note that Length(UCS4String) returns
- * the length+1 as UCS4Strings are zero-terminated.
- *}
-function LengthUCS4(const str: UCS4String): integer;
-
-{** @seealso WideCompareStr *}
-function UTF8CompareStr(const S1, S2: UTF8String): integer;
-{** @seealso WideCompareText *}
-function UTF8CompareText(const S1, S2: UTF8String): integer;
-
-function UTF8StartsText(const SubText, Text: UTF8String): boolean;
-
-function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
-function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
-
-{** @seealso WideUpperCase *}
-function UTF8UpperCase(const str: UTF8String): UTF8String;
-{** @seealso WideCompareText *}
-function UTF8LowerCase(const str: UTF8String): UTF8String;
-
-{**
- * Converts a UCS-4 char ch to its upper-case representation.
- *}
-function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;
-
-{**
- * Converts a UCS-4 string str to its upper-case representation.
- *}
-function UCS4UpperCase(const str: UCS4String): UCS4String; overload;
-
-{**
- * Converts a UCS4Char to an UCS4String.
- * Note that UCS4Strings are zero-terminated dynamic arrays.
- *}
-function UCS4CharToString(ch: UCS4Char): UCS4String;
-
-{**
- * @seealso System.Pos()
- *}
-function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
-
-{**
- * Copies a segment of str starting with Index (1-based) with Count characters (not bytes).
- *}
-function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String;
-
-{**
- * Copies a segment of str starting with Index (0-based) with Count characters.
- * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
- * a trailing #0 character and hence is invalid.
- *}
-function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;
-
-(*
- * Converts a WideString to its upper- or lower-case representation.
- * Wrapper for WideUpper/LowerCase. Needed because some plattforms have
- * problems with unicode support.
- *
- * Note that characters in UTF-16 might consist of one or two WideChar valus
- * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single
- * character access, convert to UCS-4 where each character is represented by
- * one UCS4Char.
- *)
-function WideStringUpperCase(const str: WideString) : WideString; overload;
-function WideStringUpperCase(ch: WideChar): WideString; overload;
-function WideStringLowerCase(const str: WideString): WideString; overload;
-function WideStringLowerCase(ch: WideChar): WideString; overload;
-
-function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
-
-implementation
-
-{$IFDEF UNIX}
-{$IFNDEF DARWIN}
-const
- LC_CTYPE = 0;
-
-function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c';
-{$ENDIF}
-{$ENDIF}
-
-var
- NativeUTF8: boolean;
-
-procedure InitUnicodeUtils();
-{$IFDEF UNIX}
-{$IFNDEF DARWIN}
-var
- localeName: PChar;
-{$ENDIF}
-{$ENDIF}
-begin
- {$IF Defined(DARWIN)}
- NativeUTF8 := true;
- {$ELSEIF Defined(MSWindows)}
- NativeUTF8 := false;
- {$ELSEIF Defined(UNIX)}
- // check if locale name contains UTF8 or UTF-8
- localeName := setlocale(LC_CTYPE, nil);
- NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0;
- {$ELSE}
- raise Exception.Create('Unknown system');
- {$IFEND}
-end;
-
-function IsNativeUTF8(): boolean;
-begin
- Result := NativeUTF8;
-end;
-
-function IsAlphaChar(ch: WideChar): boolean;
-begin
- {$IFDEF MSWINDOWS}
- Result := IsCharAlphaW(ch);
- {$ELSE}
- // TODO: add chars > 255 (or replace with libxml2 functions?)
- case ch of
- 'A'..'Z', // A-Z
- 'a'..'z', // a-z
- #170,#181,#186,
- #192..#214,
- #216..#246,
- #248..#255:
- Result := true;
- else
- Result := false;
- end;
- {$ENDIF}
-end;
-
-function IsAlphaChar(ch: UCS4Char): boolean;
-begin
- Result := IsAlphaChar(WideChar(Ord(ch)));
-end;
-
-function IsNumericChar(ch: WideChar): boolean;
-begin
- // TODO: replace with libxml2 functions?
- // ignore non-arabic numerals as we do not want to handle them
- case ch of
- '0'..'9':
- Result := true;
- else
- Result := false;
- end;
-end;
-
-function IsNumericChar(ch: UCS4Char): boolean;
-begin
- Result := IsNumericChar(WideChar(Ord(ch)));
-end;
-
-function IsAlphaNumericChar(ch: WideChar): boolean;
-begin
- Result := (IsAlphaChar(ch) or IsNumericChar(ch));
-end;
-
-function IsAlphaNumericChar(ch: UCS4Char): boolean;
-begin
- Result := (IsAlphaChar(ch) or IsNumericChar(ch));
-end;
-
-function IsPunctuationChar(ch: WideChar): boolean;
-begin
- // TODO: add chars > 255 (or replace with libxml2 functions?)
- case ch of
- ' '..'/',':'..'@','['..'`','{'..'~',
- #160..#191,#215,#247:
- Result := true;
- else
- Result := false;
- end;
-end;
-
-function IsPunctuationChar(ch: UCS4Char): boolean;
-begin
- Result := IsPunctuationChar(WideChar(Ord(ch)));
-end;
-
-function IsControlChar(ch: WideChar): boolean;
-begin
- case ch of
- #0..#31,
- #127..#159:
- Result := true;
- else
- Result := false;
- end;
-end;
-
-function IsControlChar(ch: UCS4Char): boolean;
-begin
- Result := IsControlChar(WideChar(Ord(ch)));
-end;
-
-function IsPrintableChar(ch: WideChar): boolean;
-begin
- Result := not IsControlChar(ch);
-end;
-
-function IsPrintableChar(ch: UCS4Char): boolean;
-begin
- Result := IsPrintableChar(WideChar(Ord(ch)));
-end;
-
-
-function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;
-
- // find the most significant zero bit (Result: [7..-1])
- function FindZeroMSB(b: byte): integer;
- var
- Mask: byte;
- begin
- Mask := $80;
- Result := 7;
- while (b and Mask <> 0) do
- begin
- Mask := Mask shr 1;
- Dec(Result);
- end;
- end;
-
-var
- ZeroBit: integer;
- SeqCount: integer; // number of trailing bytes to follow
-const
- Mask: array[1..3] of byte = ($1F, $0F, $07);
-begin
- Result := false;
- SeqCount := 0;
- Ch := 0;
-
- while (StrPtr^ <> #0) do
- begin
- if (StrPtr^ < #128) then
- begin
- // check that no more trailing bytes are expected
- if (SeqCount = 0) then
- begin
- Ch := Ord(StrPtr^);
- Inc(StrPtr);
- Result := true;
- end;
- Break;
- end
- else
- begin
- ZeroBit := FindZeroMSB(Ord(StrPtr^));
- // trailing byte expected
- if (SeqCount > 0) then
- begin
- // check if trailing byte has pattern 10xxxxxx
- if (ZeroBit <> 6) then
- begin
- Inc(StrPtr);
- Break;
- end;
-
- Dec(SeqCount);
- Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F);
-
- // check if char is finished
- if (SeqCount = 0) then
- begin
- Inc(StrPtr);
- Result := true;
- Break;
- end;
- end
- else // leading byte expected
- begin
- // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
- if (ZeroBit > 5) or (ZeroBit < 3) then
- begin
- Inc(StrPtr);
- Break;
- end;
- // calculate number of trailing bytes (1, 2 or 3)
- SeqCount := 6 - ZeroBit;
- // extract first part of char
- Ch := Ord(StrPtr^) and Mask[SeqCount];
- end;
- end;
-
- Inc(StrPtr);
- end;
-
- if (not Result) then
- Ch := Ord('?');
-end;
-
-function IsUTF8String(const str: RawByteString): boolean;
-var
- Ch: UCS4Char;
- StrPtr: PAnsiChar;
-begin
- Result := true;
- StrPtr := PChar(str);
- while (StrPtr^ <> #0) do
- begin
- if (not NextCharUTF8(StrPtr, Ch)) then
- begin
- Result := false;
- Exit;
- end;
- end;
-end;
-
-function IsASCIIString(const str: RawByteString): boolean;
-var
- I: integer;
-begin
- for I := 1 to Length(str) do
- begin
- if (str[I] >= #128) then
- begin
- Result := false;
- Exit;
- end;
- end;
- Result := true;
-end;
-
-
-function UTF8ToUCS4String(const str: UTF8String): UCS4String;
-begin
- Result := WideStringToUCS4String(UTF8Decode(str));
-end;
-
-function UCS4ToUTF8String(const str: UCS4String): UTF8String;
-begin
- Result := UTF8Encode(UCS4StringToWideString(str));
-end;
-
-function UCS4ToUTF8String(ch: UCS4Char): UTF8String;
-begin
- Result := UCS4ToUTF8String(UCS4CharToString(ch));
-end;
-
-function LengthUTF8(const str: UTF8String): integer;
-begin
- Result := LengthUCS4(UTF8ToUCS4String(str));
-end;
-
-function LengthUCS4(const str: UCS4String): integer;
-begin
- Result := High(str);
- if (Result = -1) then
- Result := 0;
-end;
-
-function UTF8CompareStr(const S1, S2: UTF8String): integer;
-begin
- Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2));
-end;
-
-function UTF8CompareText(const S1, S2: UTF8String): integer;
-begin
- Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2));
-end;
-
-function UTF8StartsStr(const SubText, Text: UTF8String): boolean;
-begin
- // TODO: use WideSameStr (slower but handles different representations of the same char)?
- Result := (Pos(SubText, Text) = 1);
-end;
-
-function UTF8StartsText(const SubText, Text: UTF8String): boolean;
-begin
- // TODO: use WideSameText (slower but handles different representations of the same char)?
- Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1);
-end;
-
-function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
-begin
- Result := Pos(SubText, Text) > 0;
-end;
-
-function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
-begin
- Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0;
-end;
-
-function UTF8UpperCase(const str: UTF8String): UTF8String;
-begin
- Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str)));
-end;
-
-function UTF8LowerCase(const str: UTF8String): UTF8String;
-begin
- Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str)));
-end;
-
-function UCS4UpperCase(ch: UCS4Char): UCS4Char;
-begin
- Result := UCS4UpperCase(UCS4CharToString(ch))[0];
-end;
-
-function UCS4UpperCase(const str: UCS4String): UCS4String;
-begin
- // convert to upper-case as WideString and convert result back to UCS-4
- Result := WideStringToUCS4String(
- WideStringUpperCase(
- UCS4StringToWideString(str)));
-end;
-
-function UCS4CharToString(ch: UCS4Char): UCS4String;
-begin
- SetLength(Result, 2);
- Result[0] := ch;
- Result[1] := 0;
-end;
-
-function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
-begin
- Result := Pos(substr, str);
-end;
-
-function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String;
-begin
- Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count));
-end;
-
-function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
-var
- I: integer;
- MaxCount: integer;
-begin
- // calculate max. copy count
- MaxCount := LengthUCS4(str)-Index;
- if (MaxCount < 0) then
- MaxCount := 0;
- // adjust copy count
- if (Count > MaxCount) or (Count < 0) then
- Count := MaxCount;
-
- // copy (and add zero terminator)
- SetLength(Result, Count + 1);
- for I := 0 to Count-1 do
- Result[I] := str[Index+I];
- Result[Count] := 0;
-end;
-
-procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
-var
- StrUCS4: UCS4String;
-begin
- StrUCS4 := UTF8ToUCS4String(str);
- UCS4Delete(StrUCS4, Index-1, Count);
- Str := UCS4ToUTF8String(StrUCS4);
-end;
-
-procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
-var
- Len: integer;
- OldStr: UCS4String;
- I: integer;
-begin
- Len := LengthUCS4(Str);
- if (Count <= 0) or (Index < 0) or (Index >= Len) then
- Exit;
- if (Index + Count > Len) then
- Count := Len-Index;
-
- OldStr := Str;
- SetLength(Str, Len-Count+1);
- for I := 0 to Index-1 do
- Str[I] := OldStr[I];
- for I := Index+Count to Len-1 do
- Str[I-Count] := OldStr[I];
- Str[High(Str)] := 0;
-end;
-
-function WideStringUpperCase(ch: WideChar): WideString;
-begin
- // If WideChar #0 is converted to a WideString in Delphi, a string with
- // length 1 and a single char #0 is returned. In FPC an empty (length=0)
- // string will be returned. This will crash, if a non printable key was
- // pressed, its char code (#0) is translated to upper-case and the the first
- // character is accessed with Result[1].
- // We cannot catch this error in the WideString parameter variant as the string
- // has length 0 already.
-
- // Force min. string length of 1
- if (ch = #0) then
- Result := #0
- else
- Result := WideStringUpperCase(WideString(ch));
-end;
-
-function WideStringUpperCase(const str: WideString): WideString;
-begin
- // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls.
- // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()).
- // The Unicode manager cwstring does not work with MacOSX at the moment because
- // of missing references to iconv.
- // Note: Should be fixed now
-
- {.$IFNDEF DARWIN}
- {.$IFDEF NOIGNORE}
- Result := WideUpperCase(str)
- {.$ELSE}
- //Result := UTF8Decode(UpperCase(UTF8Encode(str)));
- {.$ENDIF}
-end;
-
-function WideStringLowerCase(ch: WideChar): WideString;
-begin
- // see WideStringUpperCase
- if (ch = #0) then
- Result := #0
- else
- Result := WideStringLowerCase(WideString(ch));
-end;
-
-function WideStringLowerCase(const str: WideString): WideString;
-begin
- // see WideStringUpperCase
- Result := WideLowerCase(str)
-end;
-
-function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
-var
- iPos : integer;
-// sTemp : WideString;
-begin
-(*
- result := text;
- iPos := Pos(search, result);
- while (iPos > 0) do
- begin
- sTemp := copy(result, iPos + length(search), length(result));
- result := copy(result, 1, iPos - 1) + rep + sTEmp;
- iPos := Pos(search, result);
- end;
-*)
- result := text;
-
- if search = rep then
- exit;
-
- for iPos := 1 to length(result) do
- begin
- if result[iPos] = search then
- result[iPos] := rep;
- end;
-end;
-
-initialization
- InitUnicodeUtils;
-
-end.