aboutsummaryrefslogblamecommitdiffstats
path: root/src/encoding/Auto.inc
blob: f404c2f65067aada60619f714e19cfe9f8d38c6b (plain) (tree)








































































































































                                                                                                                                                                                                                                                                                                     
{* UltraStar Deluxe - Karaoke Game
 *
 * UltraStar Deluxe is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *}

// Auto
// Try to match the input against the W3C UTF-8 validation regex: on a match,
// decode it as UTF-8; otherwise decode it with the fallback encoder.
// (regex copied from http://www.w3.org/International/questions/qa-forms-utf-8.en.php)
//
// m/\A(
//    [\x09\x0A\x0D\x20-\x7E]            # ASCII
//  | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
//  |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
//  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
//  |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
//  |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
//  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
//  |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
// )*\z/x

type
  // Encoder that wraps two other encoders: a UTF-8 encoder and a fallback
  // encoder. Decode() picks one of them per input string depending on whether
  // the input matches the UTF-8 regex; Encode() always uses the UTF-8 encoder.
  TEncoderAuto = class(TEncoder)
  public
    // Stores both encoders and prepares the UTF-8 detection regex.
    constructor Create(const UTF8Encoder, FallbackEncoder: IEncoder);

    // Returns the name of this encoder ('Auto').
    function GetName(): AnsiString; override;
    // Decodes InStr with the UTF-8 encoder if it matches the UTF-8 regex,
    // otherwise with the fallback encoder.
    function Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean; override;
    // Encodes InStr with the UTF-8 encoder.
    function Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean; override;

  private
    // encoder used for input that matches the UTF-8 regex, and for all encoding
    UTF8Encoder: IEncoder;
    // encoder used for input that does not match (or if no regex is available)
    FallbackEncoder: IEncoder;
    // result of pcre_compile (nil if pcre was not loaded or compilation failed)
    Regex: PPCRE;
    // result of pcre_study for Regex
    RegexExtra: PPCREExtra;
  end;

// Allocation routine with C calling convention that hands out memory from
// the Pascal heap.
//   Size:    number of bytes to allocate
//   returns: pointer to the allocated block
function PCREGetMem(Size: SizeInt): Pointer; cdecl;
var
  Block: Pointer;
begin
  GetMem(Block, Size);
  Result := Block;
end;

// Deallocation routine with C calling convention that returns a block to
// the Pascal heap.
//   P: pointer to the block to release; nil is ignored
procedure PCREFreeMem(P: Pointer); cdecl;
begin
  // FreeMem does nothing for nil, so leaving early is equivalent
  if P = nil then
    Exit;

  FreeMem(P);
end;

// Prints an error message, prefixed with 'ERROR: ', to standard output.
// Only produces output if CONSOLE is defined; otherwise it does nothing.
// NOTICE: plain WriteLn is used here, presumably because
// Log.LogError/ConsoleWriteLn/DebugWriteLn are not initialized yet
// when this is called (TODO confirm).
procedure ShowError(const msg: string);
begin
{$IFDEF CONSOLE}
  WriteLn('ERROR: ' + msg);
{$ENDIF}
end;

// Creates the auto-detecting encoder.
//   UTF8Encoder:     encoder stored for UTF-8 input and for encoding
//   FallbackEncoder: encoder stored for everything else
// Afterwards the PCRE library is loaded and the UTF-8 validation regex is
// compiled and studied. Every failure is only reported through ShowError;
// the object is still constructed, just with Regex left unassigned.
constructor TEncoderAuto.Create(const UTF8Encoder, FallbackEncoder: IEncoder);
var
  ErrMsg: PChar;
  ErrPos: Integer;
begin
  inherited Create();
  self.UTF8Encoder := UTF8Encoder;
  self.FallbackEncoder := FallbackEncoder;

  // without the PCRE library there is nothing more to set up
  if not LoadPCRE() then
  begin
    ShowError('pcre not loaded. utf-8 autodetection will not work.');
    Exit;
  end;

  // compile the UTF-8 validation regex (see the comment at the top of this file)
  self.Regex := pcre_compile('\A([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*\z', 0, @ErrMsg, @ErrPos, nil);
  if self.Regex = nil then
  begin
    ShowError(Format('UTF8 Regex compilation failed: %s at %d', [ErrMsg, ErrPos]));
    Exit;
  end;

  // compilation succeeded: study the pattern to gather additional
  // information that can speed up matching
  self.RegexExtra := pcre_study(self.Regex, 0, @ErrMsg);
  // an error message being set after pcre_study is treated as a study failure
  if ErrMsg <> nil then
    ShowError('UTF8 Regex study failed: ' + ErrMsg);
end;

// Returns the name of this encoder, which is always 'Auto'.
function TEncoderAuto.GetName(): AnsiString;
const
  ENCODER_NAME = 'Auto';
begin
  Result := ENCODER_NAME;
end;

// Decodes InStr into OutStr, choosing the decoder by content.
//   InStr:   raw input bytes
//   OutStr:  receives the decoded string
//   returns: the result of whichever wrapped encoder's Decode was called
// If a compiled regex is available and InStr matches it, the UTF-8 encoder is
// used. In every other case (no regex, no match, or a pcre_exec error, i.e.
// a negative return value) the fallback encoder is used.
function TEncoderAuto.Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean;
var
  MatchesUTF8: boolean;
begin
  MatchesUTF8 := false;

  // only run the matcher if the regex was compiled successfully
  if self.Regex <> nil then
    MatchesUTF8 := pcre_exec(Regex, RegexExtra, PChar(InStr), Length(InStr), 0, 0, nil, 0) >= 0;

  if MatchesUTF8 then
    Result := UTF8Encoder.Decode(InStr, OutStr)
  else
    Result := FallbackEncoder.Decode(InStr, OutStr);
end;

// Encodes InStr into OutStr by delegating to the UTF-8 encoder; the fallback
// encoder is never used for encoding.
//   InStr:   string to encode
//   OutStr:  receives the encoded bytes
//   returns: the result of the UTF-8 encoder's Encode
function TEncoderAuto.Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean;
begin
  Result := self.UTF8Encoder.Encode(InStr, OutStr);
end;