aboutsummaryrefslogblamecommitdiffstats
path: root/songmanagement/src/base/UTextEncoding.pas
blob: 0c9ba4cc83f7ba6e8904e74dd1fd38d1928a8d08 (plain) (tree)























































































































































































































































                                                                                                                    
{* UltraStar Deluxe - Karaoke Game
 *
 * UltraStar Deluxe is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *}

unit UTextEncoding;

interface

{$IFDEF FPC}
  {$MODE Delphi}
{$ENDIF}

{$I switches.inc}

uses
  SysUtils,
  UUnicodeUtils;

type
  TEncoding = (
    encLocale,  // current locale (needs cwstring on linux)
    encUTF8,    // UTF-8
    encCP1250,  // Windows-1250 Central/Eastern Europe (used by Ultrastar)
    encCP1252,  // Windows-1252 Western Europe (used by UltraStar Deluxe < 1.1)
    encAuto     // try to match the w3c regex and decode as unicode on match
                // and as fallback if not match
  );

const
  UTF8_BOM: UTF8String = #$EF#$BB#$BF;

{**
 * Decodes Src encoded in SrcEncoding to a UTF-16 or UTF-8 encoded Dst string.
 * Returns true if the conversion was successful.
 *}
function DecodeString(const Src: RawByteString; out Dst: WideString; SrcEncoding: TEncoding): boolean; overload;
function DecodeString(const Src: RawByteString; SrcEncoding: TEncoding): WideString; overload;
function DecodeStringUTF8(const Src: RawByteString; out Dst: UTF8String; SrcEncoding: TEncoding): boolean; overload;
function DecodeStringUTF8(const Src: RawByteString; SrcEncoding: TEncoding): UTF8String; overload;

{**
 * Encodes the UTF-16 or UTF-8 encoded Src string to Dst using DstEncoding
 * Returns true if the conversion was successful.
 *}
function EncodeString(const Src: WideString; out Dst: RawByteString; DstEncoding: TEncoding): boolean; overload;
function EncodeString(const Src: WideString; DstEncoding: TEncoding): RawByteString; overload;
function EncodeStringUTF8(const Src: UTF8String; out Dst: RawByteString; DstEncoding: TEncoding): boolean; overload;
function EncodeStringUTF8(const Src: UTF8String; DstEncoding: TEncoding): RawByteString; overload;

{**
 * If Text starts with an UTF-8 BOM, the BOM is removed and true will
 * be returned.
 *}
function CheckReplaceUTF8BOM(var Text: RawByteString): boolean;

{**
 * Parses an encoding string to its TEncoding equivalent.
 * Surrounding whitespace and dashes ('-') are removed, the upper-cased
 * resulting value is then compared with TEncodingNames.
 * If the encoding was not found, the result is set to the Default encoding.
 *}
function ParseEncoding(const EncodingStr: AnsiString; Default: TEncoding): TEncoding;

{**
 * Returns the name of an encoding.
 *}
function EncodingName(Encoding: TEncoding): AnsiString;

implementation

uses
  StrUtils,
  pcre,
  UCommon,
  ULog;

type
  IEncoder = interface
    function GetName(): AnsiString;
    function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean;
    function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean;
  end;

  TEncoder = class(TInterfacedObject, IEncoder)
  public
    function GetName(): AnsiString; virtual; abstract;
    function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean; virtual; abstract;
    function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean; virtual; abstract;
  end;

  TSingleByteEncoder = class(TEncoder)
  public
    function Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean; override;
    function Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean; override;
    function DecodeChar(InChr: AnsiChar; out OutChr: UCS4Char): boolean; virtual; abstract;
    function EncodeChar(InChr: UCS4Char; out OutChr: AnsiChar): boolean; virtual; abstract;
  end;

const
  ERROR_CHAR = '?';

var
  Encoders: array[TEncoding] of IEncoder;

function TSingleByteEncoder.Encode(const InStr: UCS4String; out OutStr: RawByteString): boolean;
var
  I: integer;
begin
  SetLength(OutStr, LengthUCS4(InStr));
  Result := true;
  for I := 1 to Length(OutStr) do
  begin
    if (not EncodeChar(InStr[I-1], OutStr[I])) then
      Result := false;
  end;
end;

function TSingleByteEncoder.Decode(const InStr: RawByteString; out OutStr: UCS4String): boolean;
var
  I: integer;
begin
  SetLength(OutStr, Length(InStr)+1);
  Result := true;
  for I := 1 to Length(InStr) do
  begin
    if (not DecodeChar(InStr[I], OutStr[I-1])) then
      Result := false;
  end;
  OutStr[High(OutStr)] := 0;
end;

function DecodeString(const Src: RawByteString; out Dst: WideString; SrcEncoding: TEncoding): boolean;
var
  DstUCS4: UCS4String;
begin
  Result := Encoders[SrcEncoding].Decode(Src, DstUCS4);
  Dst := UCS4StringToWideString(DstUCS4);
end;

function DecodeString(const Src: RawByteString; SrcEncoding: TEncoding): WideString;
begin
  DecodeString(Src, Result, SrcEncoding);
end;

function DecodeStringUTF8(const Src: RawByteString; out Dst: UTF8String; SrcEncoding: TEncoding): boolean;
var
  DstUCS4: UCS4String;
begin
  Result := Encoders[SrcEncoding].Decode(Src, DstUCS4);
  Dst := UCS4ToUTF8String(DstUCS4);
end;

function DecodeStringUTF8(const Src: RawByteString; SrcEncoding: TEncoding): UTF8String;
begin
  DecodeStringUTF8(Src, Result, SrcEncoding);
end;

function EncodeString(const Src: WideString; out Dst: RawByteString; DstEncoding: TEncoding): boolean;
begin
  Result := Encoders[DstEncoding].Encode(WideStringToUCS4String(Src), Dst);
end;

function EncodeString(const Src: WideString; DstEncoding: TEncoding): RawByteString;
begin
  EncodeString(Src, Result, DstEncoding);
end;

function EncodeStringUTF8(const Src: UTF8String; out Dst: RawByteString; DstEncoding: TEncoding): boolean;
begin
  Result := Encoders[DstEncoding].Encode(UTF8ToUCS4String(Src), Dst);
end;

function EncodeStringUTF8(const Src: UTF8String; DstEncoding: TEncoding): RawByteString;
begin
  EncodeStringUTF8(Src, Result, DstEncoding);
end;

function CheckReplaceUTF8BOM(var Text: RawByteString): boolean;
begin
  if AnsiStartsStr(UTF8_BOM, Text) then
  begin
    Text := Copy(Text, Length(UTF8_BOM)+1, Length(Text)-Length(UTF8_BOM));
    Result := true;
    Exit;
  end;
  Result := false;
end;

function ParseEncoding(const EncodingStr: AnsiString; Default: TEncoding): TEncoding;
var
  PrepStr: AnsiString; // prepared encoding string
  Encoding: TEncoding;
begin
  // remove surrounding whitespace, replace dashes, to upper case
  PrepStr := UpperCase(AnsiReplaceStr(Trim(EncodingStr), '-', ''));
  for Encoding := Low(TEncoding) to High(TEncoding) do
  begin
    if (Encoders[Encoding].GetName() = PrepStr) then
    begin
      Result := Encoding;
      Exit;
    end;
  end;
  Result := Default;
end;

function EncodingName(Encoding: TEncoding): AnsiString;
begin
  Result := Encoders[Encoding].GetName();
end;

{$I ..\\encoding\\Locale.inc}
{$I ..\\encoding\\UTF8.inc}
{$I ..\\encoding\\CP1250.inc}
{$I ..\\encoding\\CP1252.inc}
{$I ..\\encoding\\Auto.inc}

initialization
  Encoders[encLocale] := TEncoderLocale.Create;
  Encoders[encUTF8]   := TEncoderUTF8.Create;
  Encoders[encCP1250] := TEncoderCP1250.Create;
  Encoders[encCP1252] := TEncoderCP1252.Create;

  // use USDX < 1.1 encoding for backward compatibility (encCP1252)
  Encoders[encAuto] := TEncoderAuto.Create(Encoders[encUTF8], Encoders[encCP1252]);

end.