path: root/src/base/UUnicodeUtils.pas



{* UltraStar Deluxe - Karaoke Game
 *
 * UltraStar Deluxe is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *}

unit UUnicodeUtils;

interface

{$IFDEF FPC}
  {$MODE Delphi}
{$ENDIF}

uses
{$IFDEF MSWINDOWS}
  Windows,
{$ENDIF}
  StrUtils,
  SysUtils;

type
  // String with unknown encoding. Introduced with Delphi 2009 and maybe soon
  // with FPC.
  RawByteString = AnsiString;

{**
 * Returns true if the system uses UTF-8 as default string type
 * (filesystem or API calls).
 * This is always true on Mac OS X and always false on Win32. On Unix it depends
 * on the LC_CTYPE setting.
 * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true.
 *}
function IsNativeUTF8(): boolean;

(*
 * Character classes
 *)

function IsAlphaChar(ch: WideChar): boolean; overload;
function IsAlphaChar(ch: UCS4Char): boolean; overload;

function IsNumericChar(ch: WideChar): boolean; overload;
function IsNumericChar(ch: UCS4Char): boolean; overload;

function IsAlphaNumericChar(ch: WideChar): boolean; overload;
function IsAlphaNumericChar(ch: UCS4Char): boolean; overload;

function IsPunctuationChar(ch: WideChar): boolean; overload;
function IsPunctuationChar(ch: UCS4Char): boolean; overload;

function IsControlChar(ch: WideChar): boolean; overload;
function IsControlChar(ch: UCS4Char): boolean; overload;

function IsPrintableChar(ch: WideChar): boolean; overload;
function IsPrintableChar(ch: UCS4Char): boolean; overload;

{**
 * Checks if the given string is a valid UTF-8 string.
 * If an ANSI encoded string (with char codes >= 128) is passed, the
 * function will most probably return false, as most ANSI strings sequences
 * are illegal in UTF-8.
 *}
function IsUTF8String(const str: RawByteString): boolean;

{**
 * Iterates over an UTF-8 encoded string.
 * StrPtr will be  increased to the beginning of the next character on each
 * call.
 * Results true if the given string starts with an UTF-8 encoded char.
 *}
function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

{**
 * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index.
 * Index values start with 1.
 *}
procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);

{**
 * Checks if the string is composed of ASCII characters.
 *}
function IsASCIIString(const str: RawByteString): boolean;

{*
 * String format conversion
 *}

function UTF8ToUCS4String(const str: UTF8String): UCS4String;
function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload;
function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;

{**
 * Returns the number of characters (not bytes) in string str.
 *}
function LengthUTF8(const str: UTF8String): integer;

{**
 * Returns the length of an UCS4String. Note that Length(UCS4String) returns
 * the length+1 as UCS4Strings are zero-terminated.
 *}
function LengthUCS4(const str: UCS4String): integer;

{** @seealso WideCompareStr *}
function UTF8CompareStr(const S1, S2: UTF8String): integer;
{** @seealso WideCompareText *}
function UTF8CompareText(const S1, S2: UTF8String): integer;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
function UTF8ContainsText(const Text, SubText: UTF8String): boolean;

{** @seealso WideUpperCase *}
function UTF8UpperCase(const str: UTF8String): UTF8String;
{** @seealso WideCompareText *}
function UTF8LowerCase(const str: UTF8String): UTF8String;

{**
 * Converts a UCS-4 char ch to its upper-case representation.
 *}
function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;

{**
 * Converts a UCS-4 string str to its upper-case representation.
 *}
function UCS4UpperCase(const str: UCS4String): UCS4String; overload;

{**
 * Converts a UCS4Char to an UCS4String.
 * Note that UCS4Strings are zero-terminated dynamic arrays.
 *}
function UCS4CharToString(ch: UCS4Char): UCS4String;

{**
 * @seealso System.Pos()
 *}
function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;

{**
 * Copies a segment of str starting with Index (1-based) with Count characters (not bytes).
 *}
function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String;

{**
 * Copies a segment of str starting with Index (0-based) with Count characters.
 * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
 * a trailing #0 character and hence is invalid.
 *}
function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;

(*
 * Converts a WideString to its upper- or lower-case representation.
 * Wrapper for WideUpper/LowerCase. Needed because some plattforms have
 * problems with unicode support.
 *
 * Note that characters in UTF-16 might consist of one or two WideChar valus
 * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single
 * character access, convert to UCS-4 where each character is represented by
 * one UCS4Char. 
 *)
function WideStringUpperCase(const str: WideString) : WideString; overload;
function WideStringUpperCase(ch: WideChar): WideString; overload;
function WideStringLowerCase(const str: WideString): WideString; overload;
function WideStringLowerCase(ch: WideChar): WideString; overload;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;

implementation

{$IFDEF UNIX}
{$IFNDEF DARWIN}
const
  LC_CTYPE = 0;

function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c';
{$ENDIF}
{$ENDIF}

var
  NativeUTF8: boolean;

procedure InitUnicodeUtils();
{$IFDEF UNIX}
{$IFNDEF DARWIN}
var
  localeName: PChar;
{$ENDIF}
{$ENDIF}
begin
  {$IF Defined(DARWIN)}
    NativeUTF8 := true;
  {$ELSEIF Defined(MSWindows)}
    NativeUTF8 := false;
  {$ELSEIF Defined(UNIX)}
    // check if locale name contains UTF8 or UTF-8
    localeName := setlocale(LC_CTYPE, nil);
    NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0;
  {$ELSE}
    raise Exception.Create('Unknown system');
  {$IFEND}
end;

function IsNativeUTF8(): boolean;
begin
  Result := NativeUTF8;
end;

function IsAlphaChar(ch: WideChar): boolean;
begin
  {$IFDEF MSWINDOWS}
    Result := IsCharAlphaW(ch);
  {$ELSE}
    // TODO: add chars > 255 (or replace with libxml2 functions?)
    case ch of
      'A'..'Z',  // A-Z
      'a'..'z',  // a-z
      #170,#181,#186,
      #192..#214,
      #216..#246,
      #248..#255:
        Result := true;
      else
        Result := false;
    end;
  {$ENDIF}
end;

function IsAlphaChar(ch: UCS4Char): boolean;
begin
  Result := IsAlphaChar(WideChar(Ord(ch)));
end;

function IsNumericChar(ch: WideChar): boolean;
begin
  // TODO: replace with libxml2 functions?
  // ignore non-arabic numerals as we do not want to handle them
  case ch of
    '0'..'9':
      Result := true;
    else
      Result := false;
  end;
end;

function IsNumericChar(ch: UCS4Char): boolean;
begin
  Result := IsNumericChar(WideChar(Ord(ch)));
end;

function IsAlphaNumericChar(ch: WideChar): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsAlphaNumericChar(ch: UCS4Char): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsPunctuationChar(ch: WideChar): boolean;
begin
  // TODO: add chars > 255 (or replace with libxml2 functions?)
  case ch of
    ' '..'/',':'..'@','['..'`','{'..'~',
    #160..#191,#215,#247:
      Result := true;
    else
      Result := false;
  end;
end;

function IsPunctuationChar(ch: UCS4Char): boolean;
begin
  Result := IsPunctuationChar(WideChar(Ord(ch)));
end;

function IsControlChar(ch: WideChar): boolean;
begin
  case ch of
    #0..#31,
    #127..#159:
      Result := true;
    else
      Result := false;
  end;
end;

function IsControlChar(ch: UCS4Char): boolean;
begin
  Result := IsControlChar(WideChar(Ord(ch)));
end;

function IsPrintableChar(ch: WideChar): boolean;
begin
  Result := not IsControlChar(ch);
end;

function IsPrintableChar(ch: UCS4Char): boolean;
begin
  Result := IsPrintableChar(WideChar(Ord(ch)));
end;


function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

  // find the most significant zero bit (Result: [7..-1])
  function FindZeroMSB(b: byte): integer;
  var
    Mask: byte;
  begin
    Mask := $80;
    Result := 7;
    while (b and Mask <> 0) do
    begin
      Mask := Mask shr 1;
      Dec(Result);
    end;
  end;

var
  ZeroBit: integer;
  SeqCount: integer; // number of trailing bytes to follow
const
  Mask: array[1..3] of byte = ($1F, $0F, $07);
begin
  Result := false;
  SeqCount := 0;
  Ch := 0;

  while (StrPtr^ <> #0) do
  begin
    if (StrPtr^ < #128) then
    begin
      // check that no more trailing bytes are expected
      if (SeqCount = 0) then
      begin
        Ch := Ord(StrPtr^);
        Inc(StrPtr);
        Result := true;
      end;
      Break;
    end
    else
    begin
      ZeroBit := FindZeroMSB(Ord(StrPtr^));
      // trailing byte expected
      if (SeqCount > 0) then
      begin
        // check if trailing byte has pattern 10xxxxxx
        if (ZeroBit <> 6) then
        begin
          Inc(StrPtr);
          Break;
        end;

        Dec(SeqCount);
        Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F);

        // check if char is finished
        if (SeqCount = 0) then
        begin
          Inc(StrPtr);
          Result := true;
          Break;
        end;
      end
      else // leading byte expected
      begin
        // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
        if (ZeroBit > 5) or (ZeroBit < 3) then
        begin
          Inc(StrPtr);
          Break;
        end;
        // calculate number of trailing bytes (1, 2 or 3)
        SeqCount := 6 - ZeroBit;
        // extract first part of char
        Ch := Ord(StrPtr^) and Mask[SeqCount];
      end;
    end;

    Inc(StrPtr);
  end;

  if (not Result) then
    Ch := Ord('?');
end;

function IsUTF8String(const str: RawByteString): boolean;
var
  Ch: UCS4Char;
  StrPtr: PAnsiChar;
begin
  Result := true;
  StrPtr := PChar(str);
  while (StrPtr^ <> #0) do
  begin
    if (not NextCharUTF8(StrPtr, Ch)) then
    begin
      Result := false;
      Exit;
    end;
  end;
end;

function IsASCIIString(const str: RawByteString): boolean;
var
  I: integer;
begin
  for I := 1 to Length(str) do
  begin
    if (str[I] >= #128) then
    begin
      Result := false;
      Exit;
    end;
  end;    
  Result := true;
end;


function UTF8ToUCS4String(const str: UTF8String): UCS4String;
begin
  Result := WideStringToUCS4String(UTF8Decode(str));
end;

function UCS4ToUTF8String(const str: UCS4String): UTF8String;
begin
  Result := UTF8Encode(UCS4StringToWideString(str));
end;

function UCS4ToUTF8String(ch: UCS4Char): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4CharToString(ch));
end;

function LengthUTF8(const str: UTF8String): integer;
begin
  Result := LengthUCS4(UTF8ToUCS4String(str));
end;

function LengthUCS4(const str: UCS4String): integer;
begin
  Result := High(str);
  if (Result = -1) then
    Result := 0;
end;

function UTF8CompareStr(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8CompareText(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8StartsStr(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameStr (slower but handles different representations of the same char)?
  Result := (Pos(SubText, Text) = 1);
end;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameText (slower but handles different representations of the same char)?
  Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1);
end;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(SubText, Text) > 0;
end;

function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0;
end;

function UTF8UpperCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str)));
end;

function UTF8LowerCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str)));
end;

function UCS4UpperCase(ch: UCS4Char): UCS4Char;
begin
  Result := UCS4UpperCase(UCS4CharToString(ch))[0];
end;

function UCS4UpperCase(const str: UCS4String): UCS4String;
begin
  // convert to upper-case as WideString and convert result back to UCS-4
  Result := WideStringToUCS4String(
            WideStringUpperCase(
            UCS4StringToWideString(str)));
end;

function UCS4CharToString(ch: UCS4Char): UCS4String;
begin
  SetLength(Result, 2);
  Result[0] := ch;
  Result[1] := 0;
end;

function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
begin
  Result := Pos(substr, str);
end;

function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count));
end;

function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
var
  I: integer;
  MaxCount: integer;
begin
  // calculate max. copy count
  MaxCount := LengthUCS4(str)-Index;
  if (MaxCount < 0) then
    MaxCount := 0;
  // adjust copy count
  if (Count > MaxCount) or (Count < 0) then
    Count := MaxCount;

  // copy (and add zero terminator)
  SetLength(Result, Count + 1);
  for I := 0 to Count-1 do
    Result[I] := str[Index+I];
  Result[Count] := 0;
end;

procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
var
  StrUCS4: UCS4String;
begin
  StrUCS4 := UTF8ToUCS4String(str);
  UCS4Delete(StrUCS4, Index-1, Count);
  Str := UCS4ToUTF8String(StrUCS4);
end;

procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
var
  Len: integer;
  OldStr: UCS4String;
  I: integer;
begin
  Len := LengthUCS4(Str);
  if (Count <= 0) or (Index < 0) or (Index >= Len) then
    Exit;
  if (Index + Count > Len) then
    Count := Len-Index;
  
  OldStr := Str;
  SetLength(Str, Len-Count+1);
  for I := 0 to Index-1 do
    Str[I] := OldStr[I];
  for I := Index+Count to Len-1 do
    Str[I-Count] := OldStr[I];
  Str[High(Str)] := 0;
end;

function WideStringUpperCase(ch: WideChar): WideString;
begin
  // If WideChar #0 is converted to a WideString in Delphi, a string with
  // length 1 and a single char #0 is returned. In FPC an empty (length=0)
  // string will be returned. This will crash, if a non printable key was
  // pressed, its char code (#0) is translated to upper-case and the the first
  // character is accessed with Result[1].
  // We cannot catch this error in the WideString parameter variant as the string
  // has length 0 already.
  
  // Force min. string length of 1
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringUpperCase(WideString(ch));
end;

function WideStringUpperCase(const str: WideString): WideString;
begin
  // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls.
  // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()).
  // The Unicode manager cwstring does not work with MacOSX at the moment because
  // of missing references to iconv.
  // Note: Should be fixed now

  {.$IFNDEF DARWIN}
  {.$IFDEF NOIGNORE}
    Result := WideUpperCase(str)
  {.$ELSE}
    //Result := UTF8Decode(UpperCase(UTF8Encode(str)));
  {.$ENDIF}
end;

function WideStringLowerCase(ch: WideChar): WideString;
begin
  // see WideStringUpperCase
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringLowerCase(WideString(ch));
end;

function WideStringLowerCase(const str: WideString): WideString;
begin
  // see WideStringUpperCase
  Result := WideLowerCase(str)
end;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
var
  iPos  : integer;
//  sTemp : WideString;
begin
(*
  result := text;
  iPos   := Pos(search, result);
  while (iPos > 0) do
  begin
    sTemp  := copy(result, iPos + length(search), length(result));
    result := copy(result, 1, iPos - 1) + rep + sTEmp;
    iPos   := Pos(search, result);
  end;
*)
  result := text;

  if search = rep then
    exit;

  for iPos := 1 to length(result) do
  begin
    if result[iPos] = search then
      result[iPos] := rep;
  end;
end;

initialization
  InitUnicodeUtils;

end.
{* UltraStar Deluxe - Karaoke Game
 *
 * UltraStar Deluxe is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *}

unit UUnicodeUtils;

interface

{$IFDEF FPC}
  {$MODE Delphi}
{$ENDIF}

uses
{$IFDEF MSWINDOWS}
  Windows,
{$ENDIF}
  StrUtils,
  SysUtils;

type
  // String with unknown encoding. Introduced with Delphi 2009 and maybe soon
  // with FPC.
  RawByteString = AnsiString;

{**
 * Returns true if the system uses UTF-8 as default string type
 * (filesystem or API calls).
 * This is always true on Mac OS X and always false on Win32. On Unix it depends
 * on the LC_CTYPE setting.
 * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true.
 *}
function IsNativeUTF8(): boolean;

(*
 * Character classes
 *)

function IsAlphaChar(ch: WideChar): boolean; overload;
function IsAlphaChar(ch: UCS4Char): boolean; overload;

function IsNumericChar(ch: WideChar): boolean; overload;
function IsNumericChar(ch: UCS4Char): boolean; overload;

function IsAlphaNumericChar(ch: WideChar): boolean; overload;
function IsAlphaNumericChar(ch: UCS4Char): boolean; overload;

function IsPunctuationChar(ch: WideChar): boolean; overload;
function IsPunctuationChar(ch: UCS4Char): boolean; overload;

function IsControlChar(ch: WideChar): boolean; overload;
function IsControlChar(ch: UCS4Char): boolean; overload;

function IsPrintableChar(ch: WideChar): boolean; overload;
function IsPrintableChar(ch: UCS4Char): boolean; overload;

{**
 * Checks if the given string is a valid UTF-8 string.
 * If an ANSI encoded string (with char codes >= 128) is passed, the
 * function will most probably return false, as most ANSI strings sequences
 * are illegal in UTF-8.
 *}
function IsUTF8String(const str: RawByteString): boolean;

{**
 * Iterates over an UTF-8 encoded string.
 * StrPtr will be  increased to the beginning of the next character on each
 * call.
 * Results true if the given string starts with an UTF-8 encoded char.
 *}
function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

{**
 * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index.
 * Index values start with 1.
 *}
procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);

{**
 * Checks if the string is composed of ASCII characters.
 *}
function IsASCIIString(const str: RawByteString): boolean;

{*
 * String format conversion
 *}

function UTF8ToUCS4String(const str: UTF8String): UCS4String;
function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload;
function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;

{**
 * Returns the number of characters (not bytes) in string str.
 *}
function LengthUTF8(const str: UTF8String): integer;

{**
 * Returns the length of an UCS4String. Note that Length(UCS4String) returns
 * the length+1 as UCS4Strings are zero-terminated.
 *}
function LengthUCS4(const str: UCS4String): integer;

{** @seealso WideCompareStr *}
function UTF8CompareStr(const S1, S2: UTF8String): integer;
{** @seealso WideCompareText *}
function UTF8CompareText(const S1, S2: UTF8String): integer;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
function UTF8ContainsText(const Text, SubText: UTF8String): boolean;

{** @seealso WideUpperCase *}
function UTF8UpperCase(const str: UTF8String): UTF8String;
{** @seealso WideCompareText *}
function UTF8LowerCase(const str: UTF8String): UTF8String;

{**
 * Converts a UCS-4 char ch to its upper-case representation.
 *}
function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;

{**
 * Converts a UCS-4 string str to its upper-case representation.
 *}
function UCS4UpperCase(const str: UCS4String): UCS4String; overload;

{**
 * Converts a UCS4Char to an UCS4String.
 * Note that UCS4Strings are zero-terminated dynamic arrays.
 *}
function UCS4CharToString(ch: UCS4Char): UCS4String;

{**
 * @seealso System.Pos()
 *}
function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;

{**
 * Copies a segment of str starting with Index (1-based) with Count characters (not bytes).
 *}
function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String;

{**
 * Copies a segment of str starting with Index (0-based) with Count characters.
 * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
 * a trailing #0 character and hence is invalid.
 *}
function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;

(*
 * Converts a WideString to its upper- or lower-case representation.
 * Wrapper for WideUpper/LowerCase. Needed because some plattforms have
 * problems with unicode support.
 *
 * Note that characters in UTF-16 might consist of one or two WideChar valus
 * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single
 * character access, convert to UCS-4 where each character is represented by
 * one UCS4Char. 
 *)
function WideStringUpperCase(const str: WideString) : WideString; overload;
function WideStringUpperCase(ch: WideChar): WideString; overload;
function WideStringLowerCase(const str: WideString): WideString; overload;
function WideStringLowerCase(ch: WideChar): WideString; overload;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;

implementation

{$IFDEF UNIX}
{$IFNDEF DARWIN}
const
  LC_CTYPE = 0;

function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c';
{$ENDIF}
{$ENDIF}

var
  NativeUTF8: boolean;

procedure InitUnicodeUtils();
{$IFDEF UNIX}
{$IFNDEF DARWIN}
var
  localeName: PChar;
{$ENDIF}
{$ENDIF}
begin
  {$IF Defined(DARWIN)}
    NativeUTF8 := true;
  {$ELSEIF Defined(MSWindows)}
    NativeUTF8 := false;
  {$ELSEIF Defined(UNIX)}
    // check if locale name contains UTF8 or UTF-8
    localeName := setlocale(LC_CTYPE, nil);
    NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0;
  {$ELSE}
    raise Exception.Create('Unknown system');
  {$IFEND}
end;

function IsNativeUTF8(): boolean;
begin
  Result := NativeUTF8;
end;

function IsAlphaChar(ch: WideChar): boolean;
begin
  {$IFDEF MSWINDOWS}
    Result := IsCharAlphaW(ch);
  {$ELSE}
    // TODO: add chars > 255 (or replace with libxml2 functions?)
    case ch of
      'A'..'Z',  // A-Z
      'a'..'z',  // a-z
      #170,#181,#186,
      #192..#214,
      #216..#246,
      #248..#255:
        Result := true;
      else
        Result := false;
    end;
  {$ENDIF}
end;

function IsAlphaChar(ch: UCS4Char): boolean;
begin
  Result := IsAlphaChar(WideChar(Ord(ch)));
end;

function IsNumericChar(ch: WideChar): boolean;
begin
  // TODO: replace with libxml2 functions?
  // ignore non-arabic numerals as we do not want to handle them
  case ch of
    '0'..'9':
      Result := true;
    else
      Result := false;
  end;
end;

function IsNumericChar(ch: UCS4Char): boolean;
begin
  Result := IsNumericChar(WideChar(Ord(ch)));
end;

function IsAlphaNumericChar(ch: WideChar): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsAlphaNumericChar(ch: UCS4Char): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsPunctuationChar(ch: WideChar): boolean;
begin
  // TODO: add chars > 255 (or replace with libxml2 functions?)
  case ch of
    ' '..'/',':'..'@','['..'`','{'..'~',
    #160..#191,#215,#247:
      Result := true;
    else
      Result := false;
  end;
end;

function IsPunctuationChar(ch: UCS4Char): boolean;
begin
  Result := IsPunctuationChar(WideChar(Ord(ch)));
end;

function IsControlChar(ch: WideChar): boolean;
begin
  case ch of
    #0..#31,
    #127..#159:
      Result := true;
    else
      Result := false;
  end;
end;

function IsControlChar(ch: UCS4Char): boolean;
begin
  Result := IsControlChar(WideChar(Ord(ch)));
end;

function IsPrintableChar(ch: WideChar): boolean;
begin
  Result := not IsControlChar(ch);
end;

function IsPrintableChar(ch: UCS4Char): boolean;
begin
  Result := IsPrintableChar(WideChar(Ord(ch)));
end;


function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

  // find the most significant zero bit (Result: [7..-1])
  function FindZeroMSB(b: byte): integer;
  var
    Mask: byte;
  begin
    Mask := $80;
    Result := 7;
    while (b and Mask <> 0) do
    begin
      Mask := Mask shr 1;
      Dec(Result);
    end;
  end;

var
  ZeroBit: integer;
  SeqCount: integer; // number of trailing bytes to follow
const
  Mask: array[1..3] of byte = ($1F, $0F, $07);
begin
  Result := false;
  SeqCount := 0;
  Ch := 0;

  while (StrPtr^ <> #0) do
  begin
    if (StrPtr^ < #128) then
    begin
      // check that no more trailing bytes are expected
      if (SeqCount = 0) then
      begin
        Ch := Ord(StrPtr^);
        Inc(StrPtr);
        Result := true;
      end;
      Break;
    end
    else
    begin
      ZeroBit := FindZeroMSB(Ord(StrPtr^));
      // trailing byte expected
      if (SeqCount > 0) then
      begin
        // check if trailing byte has pattern 10xxxxxx
        if (ZeroBit <> 6) then
        begin
          Inc(StrPtr);
          Break;
        end;

        Dec(SeqCount);
        Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F);

        // check if char is finished
        if (SeqCount = 0) then
        begin
          Inc(StrPtr);
          Result := true;
          Break;
        end;
      end
      else // leading byte expected
      begin
        // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
        if (ZeroBit > 5) or (ZeroBit < 3) then
        begin
          Inc(StrPtr);
          Break;
        end;
        // calculate number of trailing bytes (1, 2 or 3)
        SeqCount := 6 - ZeroBit;
        // extract first part of char
        Ch := Ord(StrPtr^) and Mask[SeqCount];
      end;
    end;

    Inc(StrPtr);
  end;

  if (not Result) then
    Ch := Ord('?');
end;

function IsUTF8String(const str: RawByteString): boolean;
var
  Ch: UCS4Char;
  StrPtr: PAnsiChar;
begin
  Result := true;
  StrPtr := PChar(str);
  while (StrPtr^ <> #0) do
  begin
    if (not NextCharUTF8(StrPtr, Ch)) then
    begin
      Result := false;
      Exit;
    end;
  end;
end;

function IsASCIIString(const str: RawByteString): boolean;
var
  I: integer;
begin
  for I := 1 to Length(str) do
  begin
    if (str[I] >= #128) then
    begin
      Result := false;
      Exit;
    end;
  end;    
  Result := true;
end;


function UTF8ToUCS4String(const str: UTF8String): UCS4String;
begin
  Result := WideStringToUCS4String(UTF8Decode(str));
end;

function UCS4ToUTF8String(const str: UCS4String): UTF8String;
begin
  Result := UTF8Encode(UCS4StringToWideString(str));
end;

function UCS4ToUTF8String(ch: UCS4Char): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4CharToString(ch));
end;

function LengthUTF8(const str: UTF8String): integer;
begin
  Result := LengthUCS4(UTF8ToUCS4String(str));
end;

function LengthUCS4(const str: UCS4String): integer;
begin
  Result := High(str);
  if (Result = -1) then
    Result := 0;
end;

function UTF8CompareStr(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8CompareText(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8StartsStr(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameStr (slower but handles different representations of the same char)?
  Result := (Pos(SubText, Text) = 1);
end;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameText (slower but handles different representations of the same char)?
  Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1);
end;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(SubText, Text) > 0;
end;

function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0;
end;

function UTF8UpperCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str)));
end;

function UTF8LowerCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str)));
end;

function UCS4UpperCase(ch: UCS4Char): UCS4Char;
begin
  Result := UCS4UpperCase(UCS4CharToString(ch))[0];
end;

function UCS4UpperCase(const str: UCS4String): UCS4String;
begin
  // convert to upper-case as WideString and convert result back to UCS-4
  Result := WideStringToUCS4String(
            WideStringUpperCase(
            UCS4StringToWideString(str)));
end;

function UCS4CharToString(ch: UCS4Char): UCS4String;
begin
  SetLength(Result, 2);
  Result[0] := ch;
  Result[1] := 0;
end;

function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
begin
  Result := Pos(substr, str);
end;

function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count));
end;

function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
var
  I: integer;
  MaxCount: integer;
begin
  // calculate max. copy count
  MaxCount := LengthUCS4(str)-Index;
  if (MaxCount < 0) then
    MaxCount := 0;
  // adjust copy count
  if (Count > MaxCount) or (Count < 0) then
    Count := MaxCount;

  // copy (and add zero terminator)
  SetLength(Result, Count + 1);
  for I := 0 to Count-1 do
    Result[I] := str[Index+I];
  Result[Count] := 0;
end;

procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
var
  StrUCS4: UCS4String;
begin
  StrUCS4 := UTF8ToUCS4String(str);
  UCS4Delete(StrUCS4, Index-1, Count);
  Str := UCS4ToUTF8String(StrUCS4);
end;

procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
var
  Len: integer;
  OldStr: UCS4String;
  I: integer;
begin
  Len := LengthUCS4(Str);
  if (Count <= 0) or (Index < 0) or (Index >= Len) then
    Exit;
  if (Index + Count > Len) then
    Count := Len-Index;
  
  OldStr := Str;
  SetLength(Str, Len-Count+1);
  for I := 0 to Index-1 do
    Str[I] := OldStr[I];
  for I := Index+Count to Len-1 do
    Str[I-Count] := OldStr[I];
  Str[High(Str)] := 0;
end;

function WideStringUpperCase(ch: WideChar): WideString;
begin
  // If WideChar #0 is converted to a WideString in Delphi, a string with
  // length 1 and a single char #0 is returned. In FPC an empty (length=0)
  // string will be returned. This will crash, if a non printable key was
  // pressed, its char code (#0) is translated to upper-case and the the first
  // character is accessed with Result[1].
  // We cannot catch this error in the WideString parameter variant as the string
  // has length 0 already.
  
  // Force min. string length of 1
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringUpperCase(WideString(ch));
end;

function WideStringUpperCase(const str: WideString): WideString;
begin
  // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls.
  // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()).
  // The Unicode manager cwstring does not work with MacOSX at the moment because
  // of missing references to iconv.
  // Note: Should be fixed now

  {.$IFNDEF DARWIN}
  {.$IFDEF NOIGNORE}
    Result := WideUpperCase(str)
  {.$ELSE}
    //Result := UTF8Decode(UpperCase(UTF8Encode(str)));
  {.$ENDIF}
end;

function WideStringLowerCase(ch: WideChar): WideString;
begin
  // see WideStringUpperCase
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringLowerCase(WideString(ch));
end;

function WideStringLowerCase(const str: WideString): WideString;
begin
  // see WideStringUpperCase
  Result := WideLowerCase(str)
end;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
var
  iPos  : integer;
//  sTemp : WideString;
begin
(*
  result := text;
  iPos   := Pos(search, result);
  while (iPos > 0) do
  begin
    sTemp  := copy(result, iPos + length(search), length(result));
    result := copy(result, 1, iPos - 1) + rep + sTEmp;
    iPos   := Pos(search, result);
  end;
*)
  result := text;

  if search = rep then
    exit;

  for iPos := 1 to length(result) do
  begin
    if result[iPos] = search then
      result[iPos] := rep;
  end;
end;

initialization
  InitUnicodeUtils;

end.