aboutsummaryrefslogtreecommitdiffstats
path: root/src/encoding
diff options
context:
space:
mode:
authors_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c>2009-12-05 12:26:00 +0000
committers_alexander <s_alexander@b956fd51-792f-4845-bead-9b4dfca2ff2c>2009-12-05 12:26:00 +0000
commitd589e6221ffcafc077eeefaa60cdc3e33a800558 (patch)
treec466f21bcbfa9546a7249b8ed2093e1bf84eeae2 /src/encoding
parente0d74e92c0c7aa5b4e0fd7ee5fae0bff8e513a27 (diff)
downloadusdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.tar.gz
usdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.tar.xz
usdx-d589e6221ffcafc077eeefaa60cdc3e33a800558.zip
added autodetection of utf8
used w3c regex to match all song lines whether they are utf8 lines and decode it on match as utf8 and as latin1 otherwise git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/trunk@1964 b956fd51-792f-4845-bead-9b4dfca2ff2c
Diffstat (limited to 'src/encoding')
-rw-r--r--src/encoding/Auto.inc127
1 files changed, 127 insertions, 0 deletions
diff --git a/src/encoding/Auto.inc b/src/encoding/Auto.inc
new file mode 100644
index 00000000..bf512f95
--- /dev/null
+++ b/src/encoding/Auto.inc
@@ -0,0 +1,127 @@
+{* UltraStar Deluxe - Karaoke Game
+ *
+ * UltraStar Deluxe is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * $URL$
+ * $Id$
+ *}
+
+// Auto
+// Try to match the input against the W3C UTF-8 regex: on a match decode it as UTF-8, otherwise decode it with the fallback encoder.
+// (copied from http://www.w3.org/International/questions/qa-forms-utf-8.en.php)
+//
+// m/\A(
+// [\x09\x0A\x0D\x20-\x7E] # ASCII
+// | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
+// | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
+// | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
+// | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
+// | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
+// | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
+// | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
+// )*\z/x
+
+type
+  // Encoder that chooses the decoder per input string: Decode() sends strings
+  // matching the UTF-8 regex to UTF8Encoder and all others to FallbackEncoder;
+  // Encode() always goes through UTF8Encoder.
+  TEncoderAuto = class(TEncoder)
+  private
+    // delegates that perform the actual conversions
+    FallbackEncoder: IEncoder;
+    UTF8Encoder: IEncoder;
+    // compiled UTF-8 detection regex (Nil when compilation failed) and the
+    // data returned by pcre_study() for it
+    Regex: PPCRE;
+    RegexExtra: PPCREExtra;
+
+  public
+    constructor Create(const UTF8Encoder, FallbackEncoder: IEncoder);
+
+    function GetName(): AnsiString; override;
+    function Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean; override;
+    function Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean; override;
+  end;
+
+function PCREGetMem(Size: SizeInt): Pointer; cdecl; // allocation callback registered with PCRE through SetPCREMallocCallback(); cdecl because it is handed to the PCRE binding as a callback
+begin
+ GetMem(Result, Size); // returns a block of Size bytes obtained from the Pascal memory manager
+end;
+
+procedure PCREFreeMem(P: Pointer); cdecl; // release callback registered with PCRE through SetPCREFreeCallback(); counterpart of PCREGetMem
+begin
+ FreeMem(P); // hands the block P back to the Pascal memory manager
+end;
+
+// Creates the auto-detecting encoder.
+//   UTF8Encoder:     delegate used when an input matches the UTF-8 regex
+//   FallbackEncoder: delegate used for every other input
+// Loads PCRE, installs the memory callbacks and compiles the W3C UTF-8 regex.
+// When compilation fails only a message is printed and Regex is Nil, so
+// Decode() will always take the fallback path.
+constructor TEncoderAuto.Create(const UTF8Encoder, FallbackEncoder: IEncoder);
+var
+  ErrMsg: PChar;
+  ErrPos: Integer;
+begin
+  // NOTICE: Log.LogError() cannot be used here because Log isn't loaded
+  inherited Create();
+  self.FallbackEncoder := FallbackEncoder;
+  self.UTF8Encoder := UTF8Encoder;
+
+  // load the PCRE library and route its allocations through our callbacks
+  LoadPCRE();
+  SetPCREMallocCallback(PCREGetMem);
+  SetPCREFreeCallback(PCREFreeMem);
+
+  // compile the UTF-8 detection regex
+  self.Regex := pcre_compile('\A([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*\z', 0, @ErrMsg, @ErrPos, nil);
+
+  // without a compiled regex there is nothing to study: report and leave
+  if self.Regex = Nil then
+  begin
+    writeln('ERROR: UTF8 Regex compilation failed: ', AnsiString(ErrMsg), ' at ', ErrPos);
+    Exit;
+  end;
+
+  // compiled successfully: try to gather extra information to speed up matching
+  self.RegexExtra := pcre_study(self.Regex, 0, @ErrMsg);
+  if ErrMsg <> Nil then
+    writeln('ERROR: UTF8 Regex study failed: ', AnsiString(ErrMsg));
+end;
+
+function TEncoderAuto.GetName(): AnsiString; // returns the fixed name of this encoder
+begin
+ Result := 'Auto';
+end;
+
+// Decodes InStr into OutStr, picking the decoder per string: when the compiled
+// regex matches InStr it is decoded by UTF8Encoder, in every other case
+// (no regex available, no match, or a pcre_exec error code) by FallbackEncoder.
+// Returns whatever the chosen delegate's Decode() returns.
+function TEncoderAuto.Decode(const InStr: AnsiString; out OutStr: UCS4String): boolean;
+var
+  MatchesUTF8: boolean;
+begin
+  // the regex is Nil when compilation failed in the constructor
+  MatchesUTF8 := false;
+  if self.Regex <> Nil then
+    // pcre_exec reports failures and "no match" as negative values
+    MatchesUTF8 := pcre_exec(Regex, RegexExtra, PChar(InStr), Length(InStr), 0, 0, Nil, 0) >= 0;
+
+  if MatchesUTF8 then
+    Result := UTF8Encoder.Decode(InStr, OutStr)
+  else
+    Result := FallbackEncoder.Decode(InStr, OutStr);
+end;
+
+function TEncoderAuto.Encode(const InStr: UCS4String; out OutStr: AnsiString): boolean; // encodes InStr into OutStr; no detection is involved on this path
+begin
+ Result := UTF8Encoder.Encode(InStr, OutStr); // always delegates to UTF8Encoder; FallbackEncoder is not consulted
+end;