From 17614ea059162f432f7feba5f39329667a335fa6 Mon Sep 17 00:00:00 2001 From: tobigun Date: Fri, 7 Nov 2008 20:49:01 +0000 Subject: - WideStringUpperCase moved to UUnicodeUtils.pas - WideCharUpperCase removed as single characters (code-point) can be represented by two WideChars (surrogates). Convert to UCS4 instead (one code-point <-> one UCS4Char). - UCS4 functions added to UUnicodeUtils - string replaced with UTF8String (although it's just a typedef) to mark UTF8 strings. git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/branches/experimental@1507 b956fd51-792f-4845-bead-9b4dfca2ff2c --- unicode/src/base/UUnicodeUtils.pas | 87 +++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) (limited to 'unicode/src/base/UUnicodeUtils.pas') diff --git a/unicode/src/base/UUnicodeUtils.pas b/unicode/src/base/UUnicodeUtils.pas index 91c5966f..49be200f 100644 --- a/unicode/src/base/UUnicodeUtils.pas +++ b/unicode/src/base/UUnicodeUtils.pas @@ -47,11 +47,51 @@ function IsAlphaNumericChar(ch: WideChar): boolean; function IsPunctuationChar(ch: WideChar): boolean; function IsControlChar(ch: WideChar): boolean; +{* + * String format conversion + *} + function UTF8ToUCS4String(const str: UTF8String): UCS4String; -function UCS4ToUTF8String(const str: UCS4String): UTF8String; +function UCS4ToUTF8String(const str: UCS4String): UTF8String; overload; +function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload; + +{** + * Returns the number of characters (not bytes) in string str. + *} +function LengthUTF8(const str: UTF8String): integer; + +{** + * Converts a UCS-4 char ch to its upper-case representation. + *} +function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload; + +{** + * Converts a UCS-4 string str to its upper-case representation. + *} +function UCS4UpperCase(const str: UCS4String): UCS4String; overload; + +{** + * Returns a UCS-4 string that holds the single char ch followed by a terminating 0. + *} +function UCS4CharToString(ch: UCS4Char): UCS4String; + +(* + + * Converts a WideString to its upper-case representation. 
+ * Wrapper for WideUpperCase. Needed because some platforms have problems with + * unicode support. + * + * Note that characters in UTF-16 might consist of one or two WideChar values + * (see surrogates). So instead of using WideStringUpperCase(ch)[1] for single + * character access, convert to UCS-4 where each character is represented by + * one UCS4Char. + *) +function WideStringUpperCase(const str: WideString) : WideString; + implementation + function IsAlphaChar(ch: WideChar): boolean; begin {$IFDEF MSWINDOWS} @@ -121,4 +161,49 @@ begin Result := UTF8Encode(UCS4StringToWideString(str)); end; +function UCS4ToUTF8String(ch: UCS4Char): UTF8String; +begin + Result := UCS4ToUTF8String(UCS4CharToString(ch)); +end; + +function LengthUTF8(const str: UTF8String): integer; +begin + Result := Length(UTF8ToUCS4String(str)) - 1; { UCS4Strings are 0-terminated (see UCS4CharToString); the terminator is not a character } +end; + +function UCS4UpperCase(ch: UCS4Char): UCS4Char; +begin + Result := UCS4UpperCase(UCS4CharToString(ch))[0]; +end; + +function UCS4UpperCase(const str: UCS4String): UCS4String; +begin + // convert to upper-case as WideString and convert result back to UCS-4 + Result := WideStringToUCS4String( + WideStringUpperCase( + UCS4StringToWideString(str))); +end; + +function UCS4CharToString(ch: UCS4Char): UCS4String; +begin + SetLength(Result, 2); + Result[0] := ch; + Result[1] := 0; +end; + +function WideStringUpperCase(const str: WideString): WideString; +begin + // On Linux and MacOSX the cwstring unit is necessary for Unicode function-calls. + // Otherwise you will get an EIntOverflow exception (thrown by unimplementedwidestring()). + // The Unicode manager cwstring does not work with MacOSX at the moment because + // of missing references to iconv. So we have to use Ansi... for the moment. + + {.$IFNDEF DARWIN} + {$IFDEF NOIGNORE} + Result := WideUpperCase(str) + {$ELSE} + Result := UTF8Decode(AnsiUpperCase(UTF8Encode(str))); + {$ENDIF} +end; + end. -- cgit v1.2.3