diff options
author | s_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c> | 2009-12-05 12:26:00 +0000 |
---|---|---|
committer | s_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c> | 2009-12-05 12:26:00 +0000 |
commit | d589e6221ffcafc077eeefaa60cdc3e33a800558 (patch) | |
tree | c466f21bcbfa9546a7249b8ed2093e1bf84eeae2 /src/encoding | |
parent | e0d74e92c0c7aa5b4e0fd7ee5fae0bff8e513a27 (diff) | |
download | usdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.tar.gz usdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.tar.xz usdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.zip |
added autodetection of utf8
Uses the W3C UTF-8 regex to test every song line: a line that matches is
decoded as UTF-8, any other line is decoded as Latin-1.
git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/trunk@1964 b956fd51-792f-4845-bead-9b4dfca2ff2c
Diffstat (limited to 'src/encoding')
-rw-r--r-- | src/encoding/Auto.inc | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/src/encoding/Auto.inc b/src/encoding/Auto.inc new file mode 100644 index 00000000..bf512f95 --- /dev/null +++ b/src/encoding/Auto.inc @@ -0,0 +1,127 @@ +{* UltraStar Deluxe - Karaoke Game + * + * UltraStar Deluxe is the legal property of its developers, whose names + * are too numerous to list here. Please refer to the COPYRIGHT + * file distributed with this source distribution. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ *
+ * $URL$
+ * $Id$
+ *}
+
+// Auto
+// Tests the input against the W3C UTF-8 validation regex. Input that matches
+// is decoded with the UTF-8 encoder, everything else is handed to the
+// fallback encoder.
+// (regex copied from http://www.w3.org/International/questions/qa-forms-utf-8.en.php)
+//
+// m/\A(
+//    [\x09\x0A\x0D\x20-\x7E]            # ASCII
+//  | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+//  |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+//  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+//  |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+//  |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+//  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+//  |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+// )*\z/x
+
+type
+  // Encoder that picks between two wrapped encoders when decoding:
+  // the UTF-8 encoder if the input matches the regex above, the fallback
+  // encoder otherwise. Encoding always goes through the UTF-8 encoder.
+  //
+  // NOTE(review): no destructor is declared here, so nothing visible in this
+  // file releases Regex / RegexExtra - confirm whether that is intentional.
+  TEncoderAuto = class(TEncoder)
+    public
+      function GetName(): AnsiString; override;
+      function Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean; override;
+      function Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean; override;
+
+      constructor Create(const UTF8Encoder, FallbackEncoder: IEncoder);
+
+    private
+      FallbackEncoder: IEncoder;  // used by Decode when the regex does not match
+      UTF8Encoder:     IEncoder;  // used by Decode on a match and by every Encode
+      Regex:           PPCRE;     // compiled pattern, Nil if compilation failed
+      RegexExtra:      PPCREExtra; // result of pcre_study, only set if Regex compiled
+  end;
+
+// Allocation callback registered with PCRE in TEncoderAuto.Create:
+// returns a block of Size bytes obtained through GetMem.
+function PCREGetMem(Size: SizeInt): Pointer; cdecl;
+begin
+  GetMem(Result, Size);
+end;
+
+// Deallocation callback registered with PCRE in TEncoderAuto.Create:
+// hands P back through FreeMem.
+procedure PCREFreeMem(P: Pointer); cdecl;
+begin
+  FreeMem(P);
+end;
+
+// Stores the two wrapped encoders, loads PCRE, registers the memory
+// callbacks and compiles (and, on success, studies) the UTF-8 pattern.
+// Failures are only reported with writeln; the object is still created and
+// Decode then relies on the fallback encoder.
+constructor TEncoderAuto.Create(const UTF8Encoder, FallbackEncoder: IEncoder);
+const
+  // single-line form of the W3C expression shown in the header comment
+  UTF8_PATTERN = '\A([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*\z';
+var
+  ErrMsg: PChar;
+  ErrPos: Integer;
+begin
+  // NOTICE: Log.LogError() is not possible here because it isn't loaded
+  inherited Create();
+  self.UTF8Encoder := UTF8Encoder;
+  self.FallbackEncoder := FallbackEncoder;
+
+  // Load and initialize the PCRE library; the callbacks are registered
+  // before the first pcre_* call below.
+  // NOTE(review): LoadPCRE is called as a plain statement - if it can fail,
+  // nothing here detects that.
+  LoadPCRE();
+  SetPCREMallocCallback(PCREGetMem);
+  SetPCREFreeCallback(PCREFreeMem);
+
+  // compile the pattern
+  self.Regex := pcre_compile(UTF8_PATTERN, 0, @ErrMsg, @ErrPos, nil);
+
+  if self.Regex = Nil then
+  begin
+    writeln('ERROR: UTF8 Regex compilation failed: ', AnsiString(ErrMsg), ' at ', ErrPos);
+    Exit;
+  end;
+
+  // compiled successfully: study the pattern to gather extra information
+  // that is passed to pcre_exec in Decode
+  self.RegexExtra := pcre_study(self.Regex, 0, @ErrMsg);
+
+  if ErrMsg <> Nil then
+    writeln('ERROR: UTF8 Regex study failed: ', AnsiString(ErrMsg));
+end;
+
+// Name of this encoder.
+function TEncoderAuto.GetName(): AnsiString;
+begin
+  Result := 'Auto';
+end;
+
+// Decodes InStr into OutStr and returns the result of whichever wrapped
+// encoder was used: the UTF-8 encoder if the compiled pattern matches InStr,
+// the fallback encoder if there is no compiled pattern or pcre_exec reports
+// a negative result.
+function TEncoderAuto.Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean;
+var
+  MatchStatus: Integer;
+begin
+  // without a compiled pattern there is nothing to test against
+  if self.Regex = Nil then
+  begin
+    Result := FallbackEncoder.Decode(InStr, OutStr);
+    Exit;
+  end;
+
+  // no capture vector is passed (Nil, 0); only the return code is inspected
+  MatchStatus := pcre_exec(Regex, RegexExtra, PChar(InStr), Length(InStr), 0, 0, Nil, 0);
+
+  if MatchStatus < 0 then
+    Result := FallbackEncoder.Decode(InStr, OutStr)
+  else
+    Result := UTF8Encoder.Decode(InStr, OutStr);
+end;
+
+// Encodes InStr into OutStr; always delegates to the UTF-8 encoder.
+function TEncoderAuto.Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean;
+begin
+  Result := self.UTF8Encoder.Encode(InStr, OutStr);
+end;