aboutsummaryrefslogtreecommitdiffstats
path: root/Lua/src/base/UTextEncoding.pas
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--Lua/src/base/UTextEncoding.pas278
1 files changed, 189 insertions, 89 deletions
diff --git a/Lua/src/base/UTextEncoding.pas b/Lua/src/base/UTextEncoding.pas
index 6eec8eec..148cd5d4 100644
--- a/Lua/src/base/UTextEncoding.pas
+++ b/Lua/src/base/UTextEncoding.pas
@@ -19,8 +19,8 @@
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
- * $URL: https://ultrastardx.svn.sourceforge.net/svnroot/ultrastardx/trunk/src/menu/UMenuText.pas $
- * $Id: UMenuText.pas 1485 2008-10-28 20:16:05Z tobigun $
+ * $URL$
+ * $Id$
*}
unit UTextEncoding;
@@ -34,114 +34,214 @@ interface
{$I switches.inc}
uses
- SysUtils;
+ SysUtils,
+ UUnicodeUtils;
type
- TEncoding = (encCP1250, encCP1252, encUTF8, encNative);
+ TEncoding = (
+ encLocale, // current locale (needs cwstring on linux)
+ encUTF8, // UTF-8
+ encCP1250, // Windows-1250 Central/Eastern Europe (used by Ultrastar)
+ encCP1252, // Windows-1252 Western Europe (used by UltraStar Deluxe < 1.1)
+ encAuto // try to match the w3c regex and decode as unicode on match
+ // and as fallback if not match
+ );
+
+const
+ UTF8_BOM: UTF8String = #$EF#$BB#$BF;
+
+{**
+ * Decodes Src encoded in SrcEncoding to a UTF-16 or UTF-8 encoded Dst string.
+ * Returns true if the conversion was successful.
+ *}
+function DecodeString(const Src: RawByteString; out Dst: WideString; SrcEncoding: TEncoding): boolean; overload;
+function DecodeString(const Src: RawByteString; SrcEncoding: TEncoding): WideString; overload;
+function DecodeStringUTF8(const Src: RawByteString; out Dst: UTF8String; SrcEncoding: TEncoding): boolean; overload;
+function DecodeStringUTF8(const Src: RawByteString; SrcEncoding: TEncoding): UTF8String; overload;
+
+{**
+ * Encodes the UTF-16 or UTF-8 encoded Src string to Dst using DstEncoding
+ * Returns true if the conversion was successful.
+ *}
+function EncodeString(const Src: WideString; out Dst: RawByteString; DstEncoding: TEncoding): boolean; overload;
+function EncodeString(const Src: WideString; DstEncoding: TEncoding): RawByteString; overload;
+function EncodeStringUTF8(const Src: UTF8String; out Dst: RawByteString; DstEncoding: TEncoding): boolean; overload;
+function EncodeStringUTF8(const Src: UTF8String; DstEncoding: TEncoding): RawByteString; overload;
+
+{**
+ * If Text starts with an UTF-8 BOM, the BOM is removed and true will
+ * be returned.
+ *}
+function CheckReplaceUTF8BOM(var Text: RawByteString): boolean;
-function RecodeString(const Src: string; SrcEncoding: TEncoding): WideString;
+{**
+ * Parses an encoding string to its TEncoding equivalent.
+ * Surrounding whitespace and dashes ('-') are removed, the upper-cased
+ * resulting value is then compared with TEncodingNames.
+ * If the encoding was not found, the result is set to the Default encoding.
+ *}
+function ParseEncoding(const EncodingStr: AnsiString; Default: TEncoding): TEncoding;
+
+{**
+ * Returns the name of an encoding.
+ *}
+function EncodingName(Encoding: TEncoding): AnsiString;
implementation
+uses
+ StrUtils,
+ pcre,
+ ULog;
+
type
- TConversionTable = array[0..127] of WideChar;
+ IEncoder = interface
+ function GetName(): AnsiString;
+ function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean;
+ function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean;
+ end;
+
+ TEncoder = class(TInterfacedObject, IEncoder)
+ public
+ function GetName(): AnsiString; virtual; abstract;
+ function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean; virtual; abstract;
+ function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean; virtual; abstract;
+ end;
+
+ TSingleByteEncoder = class(TEncoder)
+ public
+ function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean; override;
+ function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean; override;
+ function DecodeChar(InChr: AnsiChar; out OutChr: UCS4Char): boolean; virtual; abstract;
+ function EncodeChar(InChr: UCS4Char; out OutChr: AnsiChar): boolean; virtual; abstract;
+ end;
const
- // Windows-1250 Central/Eastern Europe (used by Ultrastar)
- CP1250Table: TConversionTable = (
- { $80 }
- #$20AC, #0, #$201A, #0, #$201E, #$2026, #$2020, #$2021,
- #0, #$2030, #$0160, #$2039, #$015A, #$0164, #$017D, #$0179,
- { $90 }
- #0, #$2018, #$2019, #$201C, #$201D, #$2022, #$2013, #$2014,
- #0, #$2122, #$0161, #$203A, #$015B, #$0165, #$017E, #$017A,
- { $A0 }
- #$00A0, #$02C7, #$02D8, #$0141, #$00A4, #$0104, #$00A6, #$00A7,
- #$00A8, #$00A9, #$015E, #$00AB, #$00AC, #$00AD, #$00AE, #$017B,
- { $B0 }
- #$00B0, #$00B1, #$02DB, #$0142, #$00B4, #$00B5, #$00B6, #$00B7,
- #$00B8, #$0105, #$015F, #$00BB, #$013D, #$02DD, #$013E, #$017C,
- { $C0 }
- #$0154, #$00C1, #$00C2, #$0102, #$00C4, #$0139, #$0106, #$00C7,
- #$010C, #$00C9, #$0118, #$00CB, #$011A, #$00CD, #$00CE, #$010E,
- { $D0 }
- #$0110, #$0143, #$0147, #$00D3, #$00D4, #$0150, #$00D6, #$00D7,
- #$0158, #$016E, #$00DA, #$0170, #$00DC, #$00DD, #$0162, #$00DF,
- { $E0 }
- #$0155, #$00E1, #$00E2, #$0103, #$00E4, #$013A, #$0107, #$00E7,
- #$010D, #$00E9, #$0119, #$00EB, #$011B, #$00ED, #$00EE, #$010F,
- { $F0 }
- #$0111, #$0144, #$0148, #$00F3, #$00F4, #$0151, #$00F6, #$00F7,
- #$0159, #$016F, #$00FA, #$0171, #$00FC, #$00FD, #$0163, #$02D9
- );
+ ERROR_CHAR = '?';
- // Windows-1252 Western Europe (used by UltraStar Deluxe < 1.1)
- CP1252Table: TConversionTable = (
- { $80 }
- #$20AC, #0, #$201A, #$0192, #$201E, #$2026, #$2020, #$2021,
- #$02C6, #$2030, #$0160, #$2039, #$0152, #0, #$017D, #0,
- { $90 }
- #0, #$2018, #$2019, #$201C, #$201D, #$2022, #$2013, #$2014,
- #$02DC, #$2122, #$0161, #$203A, #$0153, #0, #$017E, #$0178,
- { $A0 }
- #$00A0, #$00A1, #$00A2, #$00A3, #$00A4, #$00A5, #$00A6, #$00A7,
- #$00A8, #$00A9, #$00AA, #$00AB, #$00AC, #$00AD, #$00AE, #$00AF,
- { $B0 }
- #$00B0, #$00B1, #$00B2, #$00B3, #$00B4, #$00B5, #$00B6, #$00B7,
- #$00B8, #$00B9, #$00BA, #$00BB, #$00BC, #$00BD, #$00BE, #$00BF,
- { $C0 }
- #$00C0, #$00C1, #$00C2, #$00C3, #$00C4, #$00C5, #$00C6, #$00C7,
- #$00C8, #$00C9, #$00CA, #$00CB, #$00CC, #$00CD, #$00CE, #$00CF,
- { $D0 }
- #$00D0, #$00D1, #$00D2, #$00D3, #$00D4, #$00D5, #$00D6, #$00D7,
- #$00D8, #$00D9, #$00DA, #$00DB, #$00DC, #$00DD, #$00DE, #$00DF,
- { $E0 }
- #$00E0, #$00E1, #$00E2, #$00E3, #$00E4, #$00E5, #$00E6, #$00E7,
- #$00E8, #$00E9, #$00EA, #$00EB, #$00EC, #$00ED, #$00EE, #$00EF,
- { $F0 }
- #$00F0, #$00F1, #$00F2, #$00F3, #$00F4, #$00F5, #$00F6, #$00F7,
- #$00F8, #$00F9, #$00FA, #$00FB, #$00FC, #$00FD, #$00FE, #$00FF
- );
+var
+ Encoders: array[TEncoding] of IEncoder;
+function TSingleByteEncoder.Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean;
+var
+ I: integer;
+begin
+ SetLength(OutStr, LengthUCS4(InStr));
+ Result := true;
+ for I := 1 to Length(OutStr) do
+ begin
+ if (not EncodeChar(InStr[I-1], OutStr[I])) then
+ Result := false;
+ end;
+end;
-function Convert(const Src: string; const Table: TConversionTable): WideString;
+function TSingleByteEncoder.Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean;
var
- SrcPos, DstPos: integer;
+ I: integer;
begin
- SetLength(Result, Length(Src));
- DstPos := 1;
- for SrcPos := 1 to Length(Src) do
+ SetLength(OutStr, Length(InStr)+1);
+ Result := true;
+ for I := 1 to Length(InStr) do
begin
- if (Src[SrcPos] < #128) then
- begin
- // copy ASCII char
- Result[DstPos] := Src[SrcPos];
- Inc(DstPos);
- end
- else
+ if (not DecodeChar(InStr[I], OutStr[I-1])) then
+ Result := false;
+ end;
+ OutStr[High(OutStr)] := 0;
+end;
+
+function DecodeString(const Src: RawByteString; out Dst: WideString; SrcEncoding: TEncoding): boolean;
+var
+ DstUCS4: UCS4String;
+begin
+ Result := Encoders[SrcEncoding].Decode(Src, DstUCS4);
+ Dst := UCS4StringToWideString(DstUCS4);
+end;
+
+function DecodeString(const Src: RawByteString; SrcEncoding: TEncoding): WideString;
+begin
+ DecodeString(Src, Result, SrcEncoding);
+end;
+
+function DecodeStringUTF8(const Src: RawByteString; out Dst: UTF8String; SrcEncoding: TEncoding): boolean;
+var
+ DstUCS4: UCS4String;
+begin
+ Result := Encoders[SrcEncoding].Decode(Src, DstUCS4);
+ Dst := UCS4ToUTF8String(DstUCS4);
+end;
+
+function DecodeStringUTF8(const Src: RawByteString; SrcEncoding: TEncoding): UTF8String;
+begin
+ DecodeStringUTF8(Src, Result, SrcEncoding);
+end;
+
+function EncodeString(const Src: WideString; out Dst: RawByteString; DstEncoding: TEncoding): boolean;
+begin
+ Result := Encoders[DstEncoding].Encode(WideStringToUCS4String(Src), Dst);
+end;
+
+function EncodeString(const Src: WideString; DstEncoding: TEncoding): RawByteString;
+begin
+ EncodeString(Src, Result, DstEncoding);
+end;
+
+function EncodeStringUTF8(const Src: UTF8String; out Dst: RawByteString; DstEncoding: TEncoding): boolean;
+begin
+ Result := Encoders[DstEncoding].Encode(UTF8ToUCS4String(Src), Dst);
+end;
+
+function EncodeStringUTF8(const Src: UTF8String; DstEncoding: TEncoding): RawByteString;
+begin
+ EncodeStringUTF8(Src, Result, DstEncoding);
+end;
+
+function CheckReplaceUTF8BOM(var Text: RawByteString): boolean;
+begin
+ if AnsiStartsStr(UTF8_BOM, Text) then
+ begin
+ Text := Copy(Text, Length(UTF8_BOM)+1, Length(Text)-Length(UTF8_BOM));
+ Result := true;
+ Exit;
+ end;
+ Result := false;
+end;
+
+function ParseEncoding(const EncodingStr: AnsiString; Default: TEncoding): TEncoding;
+var
+ PrepStr: AnsiString; // prepared encoding string
+ Encoding: TEncoding;
+begin
+ // remove surrounding whitespace, replace dashes, to upper case
+ PrepStr := UpperCase(AnsiReplaceStr(Trim(EncodingStr), '-', ''));
+ for Encoding := Low(TEncoding) to High(TEncoding) do
+ begin
+ if (Encoders[Encoding].GetName() = PrepStr) then
begin
- // look-up char
- Result[DstPos] := Table[Ord(Src[SrcPos]) - 128];
- // ignore invalid characters
- if (Result[DstPos] <> #0) then
- Inc(DstPos);
+ Result := Encoding;
+ Exit;
end;
end;
- SetLength(Result, DstPos-1);
+ Result := Default;
end;
-function RecodeString(const Src: string; SrcEncoding: TEncoding): WideString;
+function EncodingName(Encoding: TEncoding): AnsiString;
begin
- case SrcEncoding of
- encCP1250:
- Result := Convert(Src, CP1250Table);
- encCP1252:
- Result := Convert(Src, CP1252Table);
- encUTF8:
- Result := UTF8Decode(Src);
- encNative:
- Result := UTF8Decode(AnsiToUtf8(Src));
- end;
+ Result := Encoders[Encoding].GetName();
end;
+{$I ..\\encoding\\Locale.inc}
+{$I ..\\encoding\\UTF8.inc}
+{$I ..\\encoding\\CP1250.inc}
+{$I ..\\encoding\\CP1252.inc}
+{$I ..\\encoding\\Auto.inc}
+
+initialization
+ Encoders[encLocale] := TEncoderLocale.Create;
+ Encoders[encUTF8] := TEncoderUTF8.Create;
+ Encoders[encCP1250] := TEncoderCP1250.Create;
+ Encoders[encCP1252] := TEncoderCP1252.Create;
+
+ // use USDX < 1.1 encoding for backward compatibility (encCP1252)
+ Encoders[encAuto] := TEncoderAuto.Create(Encoders[encUTF8], Encoders[encCP1252]);
+
end.