aboutsummaryrefslogtreecommitdiffstats
path: root/src/base/UUnicodeUtils.pas
blob: 37b53a67b9600667da17ebff83b0aff603dcd154 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
{* UltraStar Deluxe - Karaoke Game
 *
 * UltraStar Deluxe is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *}

unit UUnicodeUtils;

interface

{$IFDEF FPC}
  {$MODE Delphi}
{$ENDIF}

uses
{$IFDEF MSWINDOWS}
  Windows,
{$ENDIF}
  StrUtils,
  SysUtils;

type
  // String with unknown encoding. Introduced with Delphi 2009 and maybe soon
  // with FPC.
  RawByteString = AnsiString;

{**
 * Returns true if the system uses UTF-8 as default string type
 * (filesystem or API calls).
 * This is always true on Mac OS X and always false on Win32. On Unix it depends
 * on the LC_CTYPE setting.
 * Do not use AnsiToUTF8() or UTF8ToAnsi() if this function returns true.
 *}
function IsNativeUTF8(): boolean;

(*
 * Character classes
 *)

function IsAlphaChar(ch: WideChar): boolean; overload;
function IsAlphaChar(ch: UCS4Char): boolean; overload;

function IsNumericChar(ch: WideChar): boolean; overload;
function IsNumericChar(ch: UCS4Char): boolean; overload;

function IsAlphaNumericChar(ch: WideChar): boolean; overload;
function IsAlphaNumericChar(ch: UCS4Char): boolean; overload;

function IsPunctuationChar(ch: WideChar): boolean; overload;
function IsPunctuationChar(ch: UCS4Char): boolean; overload;

function IsControlChar(ch: WideChar): boolean; overload;
function IsControlChar(ch: UCS4Char): boolean; overload;

function IsPrintableChar(ch: WideChar): boolean; overload;
function IsPrintableChar(ch: UCS4Char): boolean; overload;

{**
 * Checks if the given string is a valid UTF-8 string.
 * If an ANSI encoded string (with char codes >= 128) is passed, the
 * function will most probably return false, as most ANSI strings sequences
 * are illegal in UTF-8.
 *}
function IsUTF8String(const str: RawByteString): boolean;

{**
 * Iterates over an UTF-8 encoded string.
 * StrPtr will be  increased to the beginning of the next character on each
 * call.
 * Results true if the given string starts with an UTF-8 encoded char.
 *}
function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

{**
 * Deletes Count chars (not bytes) beginning at char- (not byte-) position Index.
 * Index values start with 1.
 *}
procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);

{**
 * Checks if the string is composed of ASCII characters.
 *}
function IsASCIIString(const str: RawByteString): boolean;

{*
 * String format conversion
 *}

function UTF8ToUCS4String(const str: UTF8String): UCS4String;
function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload;
function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;

{**
 * Returns the number of characters (not bytes) in string str.
 *}
function LengthUTF8(const str: UTF8String): integer;

{**
 * Returns the length of an UCS4String. Note that Length(UCS4String) returns
 * the length+1 as UCS4Strings are zero-terminated.
 *}
function LengthUCS4(const str: UCS4String): integer;

{** @seealso WideCompareStr *}
function UTF8CompareStr(const S1, S2: UTF8String): integer;
{** @seealso WideCompareText *}
function UTF8CompareText(const S1, S2: UTF8String): integer;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
function UTF8ContainsText(const Text, SubText: UTF8String): boolean;

{** @seealso WideUpperCase *}
function UTF8UpperCase(const str: UTF8String): UTF8String;
{** @seealso WideCompareText *}
function UTF8LowerCase(const str: UTF8String): UTF8String;

{**
 * Converts a UCS-4 char ch to its upper-case representation.
 *}
function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;

{**
 * Converts a UCS-4 string str to its upper-case representation.
 *}
function UCS4UpperCase(const str: UCS4String): UCS4String; overload;

{**
 * Converts a UCS4Char to an UCS4String.
 * Note that UCS4Strings are zero-terminated dynamic arrays.
 *}
function UCS4CharToString(ch: UCS4Char): UCS4String;

{**
 * @seealso System.Pos()
 *}
function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;

{**
 * Copies a segment of str starting with Index (1-based) with Count characters (not bytes).
 *}
function UTF8Copy(const str: UTF8String; Index: Integer = 1; Count: Integer = -1): UTF8String;

{**
 * Copies a segment of str starting with Index (0-based) with Count characters.
 * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
 * a trailing #0 character and hence is invalid.
 *}
function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;

(*
 * Converts a WideString to its upper- or lower-case representation.
 * Wrapper for WideUpper/LowerCase. Needed because some plattforms have
 * problems with unicode support.
 *
 * Note that characters in UTF-16 might consist of one or two WideChar valus
 * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single
 * character access, convert to UCS-4 where each character is represented by
 * one UCS4Char. 
 *)
function WideStringUpperCase(const str: WideString) : WideString; overload;
function WideStringUpperCase(ch: WideChar): WideString; overload;
function WideStringLowerCase(const str: WideString): WideString; overload;
function WideStringLowerCase(ch: WideChar): WideString; overload;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;

implementation

{$IFDEF UNIX}
{$IFNDEF DARWIN}
const
  LC_CTYPE = 0;

function setlocale(category: integer; locale: PChar): PChar; cdecl; external 'c';
{$ENDIF}
{$ENDIF}

var
  NativeUTF8: boolean;

procedure InitUnicodeUtils();
{$IFDEF UNIX}
{$IFNDEF DARWIN}
var
  localeName: PChar;
{$ENDIF}
{$ENDIF}
begin
  {$IF Defined(DARWIN)}
    NativeUTF8 := true;
  {$ELSEIF Defined(MSWindows)}
    NativeUTF8 := false;
  {$ELSEIF Defined(UNIX)}
    // check if locale name contains UTF8 or UTF-8
    localeName := setlocale(LC_CTYPE, nil);
    NativeUTF8 := Pos('UTF8', UpperCase(AnsiReplaceStr(localeName, '-', ''))) > 0;
  {$ELSE}
    raise Exception.Create('Unknown system');
  {$IFEND}
end;

function IsNativeUTF8(): boolean;
begin
  Result := NativeUTF8;
end;

function IsAlphaChar(ch: WideChar): boolean;
begin
  {$IFDEF MSWINDOWS}
    Result := IsCharAlphaW(ch);
  {$ELSE}
    // TODO: add chars > 255 (or replace with libxml2 functions?)
    case ch of
      'A'..'Z',  // A-Z
      'a'..'z',  // a-z
      #170,#181,#186,
      #192..#214,
      #216..#246,
      #248..#255:
        Result := true;
      else
        Result := false;
    end;
  {$ENDIF}
end;

function IsAlphaChar(ch: UCS4Char): boolean;
begin
  Result := IsAlphaChar(WideChar(Ord(ch)));
end;

function IsNumericChar(ch: WideChar): boolean;
begin
  // TODO: replace with libxml2 functions?
  // ignore non-arabic numerals as we do not want to handle them
  case ch of
    '0'..'9':
      Result := true;
    else
      Result := false;
  end;
end;

function IsNumericChar(ch: UCS4Char): boolean;
begin
  Result := IsNumericChar(WideChar(Ord(ch)));
end;

function IsAlphaNumericChar(ch: WideChar): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsAlphaNumericChar(ch: UCS4Char): boolean;
begin
  Result := (IsAlphaChar(ch) or IsNumericChar(ch));
end;

function IsPunctuationChar(ch: WideChar): boolean;
begin
  // TODO: add chars > 255 (or replace with libxml2 functions?)
  case ch of
    ' '..'/',':'..'@','['..'`','{'..'~',
    #160..#191,#215,#247:
      Result := true;
    else
      Result := false;
  end;
end;

function IsPunctuationChar(ch: UCS4Char): boolean;
begin
  Result := IsPunctuationChar(WideChar(Ord(ch)));
end;

function IsControlChar(ch: WideChar): boolean;
begin
  case ch of
    #0..#31,
    #127..#159:
      Result := true;
    else
      Result := false;
  end;
end;

function IsControlChar(ch: UCS4Char): boolean;
begin
  Result := IsControlChar(WideChar(Ord(ch)));
end;

function IsPrintableChar(ch: WideChar): boolean;
begin
  Result := not IsControlChar(ch);
end;

function IsPrintableChar(ch: UCS4Char): boolean;
begin
  Result := IsPrintableChar(WideChar(Ord(ch)));
end;


function NextCharUTF8(var StrPtr: PAnsiChar; out Ch: UCS4Char): boolean;

  // find the most significant zero bit (Result: [7..-1])
  function FindZeroMSB(b: byte): integer;
  var
    Mask: byte;
  begin
    Mask := $80;
    Result := 7;
    while (b and Mask <> 0) do
    begin
      Mask := Mask shr 1;
      Dec(Result);
    end;
  end;

var
  ZeroBit: integer;
  SeqCount: integer; // number of trailing bytes to follow
const
  Mask: array[1..3] of byte = ($1F, $0F, $07);
begin
  Result := false;
  SeqCount := 0;
  Ch := 0;

  while (StrPtr^ <> #0) do
  begin
    if (StrPtr^ < #128) then
    begin
      // check that no more trailing bytes are expected
      if (SeqCount = 0) then
      begin
        Ch := Ord(StrPtr^);
        Inc(StrPtr);
        Result := true;
      end;
      Break;
    end
    else
    begin
      ZeroBit := FindZeroMSB(Ord(StrPtr^));
      // trailing byte expected
      if (SeqCount > 0) then
      begin
        // check if trailing byte has pattern 10xxxxxx
        if (ZeroBit <> 6) then
        begin
          Inc(StrPtr);
          Break;
        end;

        Dec(SeqCount);
        Ch := (Ch shl 6) or (Ord(StrPtr^) and $3F);

        // check if char is finished
        if (SeqCount = 0) then
        begin
          Inc(StrPtr);
          Result := true;
          Break;
        end;
      end
      else // leading byte expected
      begin
        // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
        if (ZeroBit > 5) or (ZeroBit < 3) then
        begin
          Inc(StrPtr);
          Break;
        end;
        // calculate number of trailing bytes (1, 2 or 3)
        SeqCount := 6 - ZeroBit;
        // extract first part of char
        Ch := Ord(StrPtr^) and Mask[SeqCount];
      end;
    end;

    Inc(StrPtr);
  end;

  if (not Result) then
    Ch := Ord('?');
end;

function IsUTF8String(const str: RawByteString): boolean;
var
  Ch: UCS4Char;
  StrPtr: PAnsiChar;
begin
  Result := true;
  StrPtr := PChar(str);
  while (StrPtr^ <> #0) do
  begin
    if (not NextCharUTF8(StrPtr, Ch)) then
    begin
      Result := false;
      Exit;
    end;
  end;
end;

function IsASCIIString(const str: RawByteString): boolean;
var
  I: integer;
begin
  for I := 1 to Length(str) do
  begin
    if (str[I] >= #128) then
    begin
      Result := false;
      Exit;
    end;
  end;    
  Result := true;
end;


function UTF8ToUCS4String(const str: UTF8String): UCS4String;
begin
  Result := WideStringToUCS4String(UTF8Decode(str));
end;

function UCS4ToUTF8String(const str: UCS4String): UTF8String;
begin
  Result := UTF8Encode(UCS4StringToWideString(str));
end;

function UCS4ToUTF8String(ch: UCS4Char): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4CharToString(ch));
end;

function LengthUTF8(const str: UTF8String): integer;
begin
  Result := LengthUCS4(UTF8ToUCS4String(str));
end;

function LengthUCS4(const str: UCS4String): integer;
begin
  Result := High(str);
  if (Result = -1) then
    Result := 0;
end;

function UTF8CompareStr(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareStr(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8CompareText(const S1, S2: UTF8String): integer;
begin
  Result := WideCompareText(UTF8Decode(S1), UTF8Decode(S2));
end;

function UTF8StartsStr(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameStr (slower but handles different representations of the same char)?
  Result := (Pos(SubText, Text) = 1);
end;

function UTF8StartsText(const SubText, Text: UTF8String): boolean;
begin
  // TODO: use WideSameText (slower but handles different representations of the same char)?
  Result := (Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) = 1);
end;

function UTF8ContainsStr(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(SubText, Text) > 0;
end;

function UTF8ContainsText(const Text, SubText: UTF8String): boolean;
begin
  Result := Pos(UTF8UpperCase(SubText), UTF8UpperCase(Text)) > 0;
end;

function UTF8UpperCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringUpperCase(UTF8Decode(str)));
end;

function UTF8LowerCase(const str: UTF8String): UTF8String;
begin
  Result := UTF8Encode(WideStringLowerCase(UTF8Decode(str)));
end;

function UCS4UpperCase(ch: UCS4Char): UCS4Char;
begin
  Result := UCS4UpperCase(UCS4CharToString(ch))[0];
end;

function UCS4UpperCase(const str: UCS4String): UCS4String;
begin
  // convert to upper-case as WideString and convert result back to UCS-4
  Result := WideStringToUCS4String(
            WideStringUpperCase(
            UCS4StringToWideString(str)));
end;

function UCS4CharToString(ch: UCS4Char): UCS4String;
begin
  SetLength(Result, 2);
  Result[0] := ch;
  Result[1] := 0;
end;

function UTF8Pos(const substr: UTF8String; const str: UTF8String): Integer;
begin
  Result := Pos(substr, str);
end;

function UTF8Copy(const str: UTF8String; Index: Integer; Count: Integer): UTF8String;
begin
  Result := UCS4ToUTF8String(UCS4Copy(UTF8ToUCS4String(str), Index-1, Count));
end;

function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
var
  I: integer;
  MaxCount: integer;
begin
  // calculate max. copy count
  MaxCount := LengthUCS4(str)-Index;
  if (MaxCount < 0) then
    MaxCount := 0;
  // adjust copy count
  if (Count > MaxCount) or (Count < 0) then
    Count := MaxCount;

  // copy (and add zero terminator)
  SetLength(Result, Count + 1);
  for I := 0 to Count-1 do
    Result[I] := str[Index+I];
  Result[Count] := 0;
end;

procedure UTF8Delete(var Str: UTF8String; Index: Integer; Count: Integer);
var
  StrUCS4: UCS4String;
begin
  StrUCS4 := UTF8ToUCS4String(str);
  UCS4Delete(StrUCS4, Index-1, Count);
  Str := UCS4ToUTF8String(StrUCS4);
end;

procedure UCS4Delete(var Str: UCS4String; Index: Integer; Count: Integer);
var
  Len: integer;
  OldStr: UCS4String;
  I: integer;
begin
  Len := LengthUCS4(Str);
  if (Count <= 0) or (Index < 0) or (Index >= Len) then
    Exit;
  if (Index + Count > Len) then
    Count := Len-Index;
  
  OldStr := Str;
  SetLength(Str, Len-Count+1);
  for I := 0 to Index-1 do
    Str[I] := OldStr[I];
  for I := Index+Count to Len-1 do
    Str[I-Count] := OldStr[I];
  Str[High(Str)] := 0;
end;

function WideStringUpperCase(ch: WideChar): WideString;
begin
  // If WideChar #0 is converted to a WideString in Delphi, a string with
  // length 1 and a single char #0 is returned. In FPC an empty (length=0)
  // string will be returned. This will crash, if a non printable key was
  // pressed, its char code (#0) is translated to upper-case and the the first
  // character is accessed with Result[1].
  // We cannot catch this error in the WideString parameter variant as the string
  // has length 0 already.
  
  // Force min. string length of 1
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringUpperCase(WideString(ch));
end;

function WideStringUpperCase(const str: WideString): WideString;
begin
  // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls.
  // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()).
  // The Unicode manager cwstring does not work with MacOSX at the moment because
  // of missing references to iconv.
  // Note: Should be fixed now

  {.$IFNDEF DARWIN}
  {.$IFDEF NOIGNORE}
    Result := WideUpperCase(str)
  {.$ELSE}
    //Result := UTF8Decode(UpperCase(UTF8Encode(str)));
  {.$ENDIF}
end;

function WideStringLowerCase(ch: WideChar): WideString;
begin
  // see WideStringUpperCase
  if (ch = #0) then
    Result := #0
  else
    Result := WideStringLowerCase(WideString(ch));
end;

function WideStringLowerCase(const str: WideString): WideString;
begin
  // see WideStringUpperCase
  Result := WideLowerCase(str)
end;

function WideStringReplaceChar(const text: WideString; search, rep: WideChar): WideString;
var
  iPos  : integer;
//  sTemp : WideString;
begin
(*
  result := text;
  iPos   := Pos(search, result);
  while (iPos > 0) do
  begin
    sTemp  := copy(result, iPos + length(search), length(result));
    result := copy(result, 1, iPos - 1) + rep + sTEmp;
    iPos   := Pos(search, result);
  end;
*)
  result := text;

  if search = rep then
    exit;

  for iPos := 1 to length(result) do
  begin
    if result[iPos] = search then
      result[iPos] := rep;
  end;
end;

initialization
  InitUnicodeUtils;

end.