From 46bb010ca7c5eb04551c030105f9999ca80e472f Mon Sep 17 00:00:00 2001 From: tobigun Date: Sun, 8 Jun 2008 15:33:48 +0000 Subject: - set svn:eol-style to native - removed some svn:executable properties from non-executable files git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/trunk@1144 b956fd51-792f-4845-bead-9b4dfca2ff2c --- Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas | 5376 +++++++++++------------ 1 file changed, 2688 insertions(+), 2688 deletions(-) (limited to 'Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas') diff --git a/Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas b/Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas index 0cdf44c7..63e7b7fb 100644 --- a/Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas +++ b/Game/Code/lib/JEDI-SDL/SDL/Pas/libxmlparser.pas @@ -1,2688 +1,2688 @@ -(** -=============================================================================================== -Name : LibXmlParser -=============================================================================================== -Project : All Projects -=============================================================================================== -Subject : Progressive XML Parser for all types of XML Files -=============================================================================================== -Author : Stefan Heymann - Eschenweg 3 - 72076 Tübingen - GERMANY - -E-Mail: stefan@destructor.de -URL: www.destructor.de -=============================================================================================== -Source, Legals ("Licence") --------------------------- -The official site to get this parser is http://www.destructor.de/ - -Usage and Distribution of this Source Code is ruled by the -"Destructor.de Source code Licence" (DSL) which comes with this file or -can be downloaded at http://www.destructor.de/ - -IN SHORT: Usage and distribution of this source code is free. - You use it completely on your own risk. - -Postcardware ------------- -If you like this code, please send a postcard of your city to my above address. -=============================================================================================== -!!! All parts of this code which are not finished or not conforming exactly to - the XmlSpec are marked with three exclamation marks - --!- Parts where the parser may be able to detect errors in the document's syntax are - marked with the dash-exlamation mark-dash sequence. -=============================================================================================== -Terminology: ------------- -- Start: Start of a buffer part -- Final: End (last character) of a buffer part -- DTD: Document Type Definition -- DTDc: Document Type Declaration -- XMLSpec: The current W3C XML Recommendation (version 1.0 as of 1998-02-10), Chapter No. -- Cur*: Fields concerning the "Current" part passed back by the "Scan" method -=============================================================================================== -Scanning the XML document -------------------------- -- Create TXmlParser Instance MyXml := TXmlParser.Create; -- Load XML Document MyXml.LoadFromFile (Filename); -- Start Scanning MyXml.StartScan; -- Scan Loop WHILE MyXml.Scan DO -- Test for Part Type CASE MyXml.CurPartType OF -- Handle Parts ... : ;;; -- Handle Parts ... : ;;; -- Handle Parts ... : ;;; - END; -- Destroy MyXml.Free; -=============================================================================================== -Loading the XML document ------------------------- -You can load the XML document from a file with the "LoadFromFile" method. -It is beyond the scope of this parser to perform HTTP or FTP accesses. If you want your -application to handle such requests (URLs), you can load the XML via HTTP or FTP or whatever -protocol and hand over the data buffer using the "LoadFromBuffer" or "SetBuffer" method. -"LoadFromBuffer" loads the internal buffer of TXmlParser with the given null-terminated -string, thereby creating a copy of that buffer. -"SetBuffer" just takes the pointer to another buffer, which means that the given -buffer pointer must be valid while the document is accessed via TXmlParser. -=============================================================================================== -Encodings: ----------- -This XML parser kind of "understands" the following encodings: -- UTF-8 -- ISO-8859-1 -- Windows-1252 - -Any flavor of multi-byte characters (and this includes UTF-16) is not supported. Sorry. - -Every string which has to be passed to the application passes the virtual method -"TranslateEncoding" which translates the string from the current encoding (stored in -"CurEncoding") into the encoding the application wishes to receive. -The "TranslateEncoding" method that is built into TXmlParser assumes that the application -wants to receive Windows ANSI (Windows-1252, about the same as ISO-8859-1) and is able -to convert UTF-8 and ISO-8859-1 encodings. -For other source and target encodings, you will have to override "TranslateEncoding". -=============================================================================================== -Buffer Handling ---------------- -- The document must be loaded completely into a piece of RAM -- All character positions are referenced by PChar pointers -- The TXmlParser instance can either "own" the buffer itself (then, FBufferSize is > 0) - or reference the buffer of another instance or object (then, FBuffersize is 0 and - FBuffer is not NIL) -- The Property DocBuffer passes back a pointer to the first byte of the document. If there - is no document stored (FBuffer is NIL), the DocBuffer returns a pointer to a NULL character. -=============================================================================================== -Whitespace Handling -------------------- -The TXmlParser property "PackSpaces" determines how Whitespace is returned in Text Content: -While PackSpaces is true, all leading and trailing whitespace characters are trimmed of, all -Whitespace is converted to Space #x20 characters and contiguous Whitespace characters are -compressed to one. -If the "Scan" method reports a ptContent part, the application can get the original text -with all whitespace characters by extracting the characters from "CurStart" to "CurFinal". -If the application detects an xml:space attribute, it can set "PackSpaces" accordingly or -use CurStart/CurFinal. -Please note that TXmlParser does _not_ normalize Line Breaks to single LineFeed characters -as the XmlSpec requires (XmlSpec 2.11). -The xml:space attribute is not handled by TXmlParser. This is on behalf of the application. -=============================================================================================== -Non-XML-Conforming ------------------- -TXmlParser does not conform 100 % exactly to the XmlSpec: -- UTF-16 is not supported (XmlSpec 2.2) - (Workaround: Convert UTF-16 to UTF-8 and hand the buffer over to TXmlParser) -- As the parser only works with single byte strings, all Unicode characters > 255 - can currently not be handled correctly. -- Line breaks are not normalized to single Linefeed #x0A characters (XmlSpec 2.11) - (Workaround: The Application can access the text contents on its own [CurStart, CurFinal], - thereby applying every normalization it wishes to) -- The attribute value normalization does not work exactly as defined in the - Second Edition of the XML 1.0 specification. -- See also the code parts marked with three consecutive exclamation marks. These are - parts which are not finished in the current code release. - -This list may be incomplete, so it may grow if I get to know any other points. -As work on the parser proceeds, this list may also shrink. -=============================================================================================== -Things Todo ------------ -- Introduce a new event/callback which is called when there is an unresolvable - entity or character reference -- Support Unicode -- Use Streams instead of reading the whole XML into memory -=============================================================================================== -Change History, Version numbers -------------------------------- -The Date is given in ISO Year-Month-Day (YYYY-MM-DD) order. -Versions are counted from 1.0.0 beginning with the version from 2000-03-16. -Unreleased versions don't get a version number. - -Date Author Version Changes ------------------------------------------------------------------------------------------------ -2000-03-16 HeySt 1.0.0 Start -2000-03-28 HeySt 1.0.1 Initial Publishing of TXmlParser on the destructor.de Web Site -2000-03-30 HeySt 1.0.2 TXmlParser.AnalyzeCData: Call "TranslateEncoding" for CurContent -2000-03-31 HeySt 1.0.3 Deleted the StrPosE function (was not needed anyway) -2000-04-04 HeySt 1.0.4 TDtdElementRec modified: Start/Final for all Elements; - Should be backwards compatible. - AnalyzeDtdc: Set CurPartType to ptDtdc -2000-04-23 HeySt 1.0.5 New class TObjectList. Eliminated reference to the Delphi 5 - "Contnrs" unit so LibXmlParser is Delphi 4 compatible. -2000-07-03 HeySt 1.0.6 TNvpNode: Added Constructor -2000-07-11 HeySt 1.0.7 Removed "Windows" from USES clause - Added three-exclamation-mark comments for Utf8ToAnsi/AnsiToUtf8 - Added three-exclamation-mark comments for CHR function calls -2000-07-23 HeySt 1.0.8 TXmlParser.Clear: CurAttr.Clear; EntityStack.Clear; - (This was not a bug; just defensive programming) -2000-07-29 HeySt 1.0.9 TNvpList: Added methods: Node(Index), Value(Index), Name(Index); -2000-10-07 HeySt Introduced Conditional Defines - Uses Contnrs unit and its TObjectList class again for - Delphi 5 and newer versions -2001-01-30 HeySt Introduced Version Numbering - Made LoadFromFile and LoadFromBuffer BOOLEAN functions - Introduced FileMode parameter for LoadFromFile - BugFix: TAttrList.Analyze: Must add CWhitespace to ExtractName call - Comments worked over -2001-02-28 HeySt 1.0.10 Completely worked over and tested the UTF-8 functions - Fixed a bug in TXmlParser.Scan which caused it to start over when it - was called after the end of scanning, resulting in an endless loop - TEntityStack is now a TObjectList instead of TList -2001-07-03 HeySt 1.0.11 Updated Compiler Version IFDEFs for Kylix -2001-07-11 HeySt 1.0.12 New TCustomXmlScanner component (taken over from LibXmlComps.pas) -2001-07-14 HeySt 1.0.13 Bugfix TCustomXmlScanner.FOnTranslateEncoding -2001-10-22 HeySt Don't clear CurName anymore when the parser finds a CDATA section. -2001-12-03 HeySt 1.0.14 TObjectList.Clear: Make call to INHERITED method (fixes a memory leak) -2001-12-05 HeySt 1.0.15 TObjectList.Clear: removed call to INHERITED method - TObjectList.Destroy: Inserted SetCapacity call. - Reduces need for frequent re-allocation of pointer buffer - Dedicated to my father, Theodor Heymann -2002-06-26 HeySt 1.0.16 TXmlParser.Scan: Fixed a bug with PIs whose name is beginning - with 'xml'. Thanks to Uwe Kamm for submitting this bug. - The CurEncoding property is now always in uppercase letters (the XML - spec wants it to be treated case independently so when it's uppercase - comparisons are faster) -2002-03-04 HeySt 1.0.17 Included an IFDEF for Delphi 7 (VER150) and Kylix - There is a new symbol HAS_CONTNRS_UNIT which is used now to - distinguish between IDEs which come with the Contnrs unit and - those that don't. -*) - -UNIT libxmlparser; - -{$I jedi-sdl.inc} - -INTERFACE - -USES - SysUtils, Classes, - (*$IFDEF HAS_CONTNRS_UNIT *) // The Contnrs Unit was introduced in Delphi 5 - Contnrs, - (*$ENDIF*) - Math; - -CONST - CVersion = '1.0.17'; // This variable will be updated for every release - // (I hope, I won't forget to do it everytime ...) - -TYPE - TPartType = // --- Document Part Types - (ptNone, // Nothing - ptXmlProlog, // XML Prolog XmlSpec 2.8 / 4.3.1 - ptComment, // Comment XmlSpec 2.5 - ptPI, // Processing Instruction XmlSpec 2.6 - ptDtdc, // Document Type Declaration XmlSpec 2.8 - ptStartTag, // Start Tag XmlSpec 3.1 - ptEmptyTag, // Empty-Element Tag XmlSpec 3.1 - ptEndTag, // End Tag XmlSpec 3.1 - ptContent, // Text Content between Tags - ptCData); // CDATA Section XmlSpec 2.7 - - TDtdElemType = // --- DTD Elements - (deElement, // !ELEMENT declaration - deAttList, // !ATTLIST declaration - deEntity, // !ENTITY declaration - deNotation, // !NOTATION declaration - dePI, // PI in DTD - deComment, // Comment in DTD - deError); // Error found in the DTD - -TYPE - TAttrList = CLASS; - TEntityStack = CLASS; - TNvpList = CLASS; - TElemDef = CLASS; - TElemList = CLASS; - TEntityDef = CLASS; - TNotationDef = CLASS; - - TDtdElementRec = RECORD // --- This Record is returned by the DTD parser callback function - Start, Final : PChar; // Start/End of the Element's Declaration - CASE ElementType : TDtdElemType OF // Type of the Element - deElement, // - deAttList : (ElemDef : TElemDef); // - deEntity : (EntityDef : TEntityDef); // - deNotation : (NotationDef : TNotationDef); // - dePI : (Target : PChar; // - Content : PChar; - AttrList : TAttrList); - deError : (Pos : PChar); // Error - // deComment : ((No additional fields here)); // - END; - - TXmlParser = CLASS // --- Internal Properties and Methods - PROTECTED - FBuffer : PChar; // NIL if there is no buffer available - FBufferSize : INTEGER; // 0 if the buffer is not owned by the Document instance - FSource : STRING; // Name of Source of document. Filename for Documents loaded with LoadFromFile - - FXmlVersion : STRING; // XML version from Document header. Default is '1.0' - FEncoding : STRING; // Encoding from Document header. Default is 'UTF-8' - FStandalone : BOOLEAN; // Standalone declaration from Document header. Default is 'yes' - FRootName : STRING; // Name of the Root Element (= DTD name) - FDtdcFinal : PChar; // Pointer to the '>' character terminating the DTD declaration - - FNormalize : BOOLEAN; // If true: Pack Whitespace and don't return empty contents - EntityStack : TEntityStack; // Entity Stack for Parameter and General Entities - FCurEncoding : STRING; // Current Encoding during parsing (always uppercase) - - PROCEDURE AnalyzeProlog; // Analyze XML Prolog or Text Declaration - PROCEDURE AnalyzeComment (Start : PChar; VAR Final : PChar); // Analyze Comments - PROCEDURE AnalyzePI (Start : PChar; VAR Final : PChar); // Analyze Processing Instructions (PI) - PROCEDURE AnalyzeDtdc; // Analyze Document Type Declaration - PROCEDURE AnalyzeDtdElements (Start : PChar; VAR Final : PChar); // Analyze DTD declarations - PROCEDURE AnalyzeTag; // Analyze Start/End/Empty-Element Tags - PROCEDURE AnalyzeCData; // Analyze CDATA Sections - PROCEDURE AnalyzeText (VAR IsDone : BOOLEAN); // Analyze Text Content between Tags - PROCEDURE AnalyzeElementDecl (Start : PChar; VAR Final : PChar); - PROCEDURE AnalyzeAttListDecl (Start : PChar; VAR Final : PChar); - PROCEDURE AnalyzeEntityDecl (Start : PChar; VAR Final : PChar); - PROCEDURE AnalyzeNotationDecl (Start : PChar; VAR Final : PChar); - - PROCEDURE PushPE (VAR Start : PChar); - PROCEDURE ReplaceCharacterEntities (VAR Str : STRING); - PROCEDURE ReplaceParameterEntities (VAR Str : STRING); - PROCEDURE ReplaceGeneralEntities (VAR Str : STRING); - - FUNCTION GetDocBuffer : PChar; // Returns FBuffer or a pointer to a NUL char if Buffer is empty - - PUBLIC // --- Document Properties - PROPERTY XmlVersion : STRING READ FXmlVersion; // XML version from the Document Prolog - PROPERTY Encoding : STRING READ FEncoding; // Document Encoding from Prolog - PROPERTY Standalone : BOOLEAN READ FStandalone; // Standalone Declaration from Prolog - PROPERTY RootName : STRING READ FRootName; // Name of the Root Element - PROPERTY Normalize : BOOLEAN READ FNormalize WRITE FNormalize; // True if Content is to be normalized - PROPERTY Source : STRING READ FSource; // Name of Document Source (Filename) - PROPERTY DocBuffer : PChar READ GetDocBuffer; // Returns document buffer - PUBLIC // --- DTD Objects - Elements : TElemList; // Elements: List of TElemDef (contains Attribute Definitions) - Entities : TNvpList; // General Entities: List of TEntityDef - ParEntities : TNvpList; // Parameter Entities: List of TEntityDef - Notations : TNvpList; // Notations: List of TNotationDef - PUBLIC - CONSTRUCTOR Create; - DESTRUCTOR Destroy; OVERRIDE; - - // --- Document Handling - FUNCTION LoadFromFile (Filename : STRING; - FileMode : INTEGER = fmOpenRead OR fmShareDenyNone) : BOOLEAN; - // Loads Document from given file - FUNCTION LoadFromBuffer (Buffer : PChar) : BOOLEAN; // Loads Document from another buffer - PROCEDURE SetBuffer (Buffer : PChar); // References another buffer - PROCEDURE Clear; // Clear Document - - PUBLIC - // --- Scanning through the document - CurPartType : TPartType; // Current Type - CurName : STRING; // Current Name - CurContent : STRING; // Current Normalized Content - CurStart : PChar; // Current First character - CurFinal : PChar; // Current Last character - CurAttr : TAttrList; // Current Attribute List - PROPERTY CurEncoding : STRING READ FCurEncoding; // Current Encoding - PROCEDURE StartScan; - FUNCTION Scan : BOOLEAN; - - // --- Events / Callbacks - FUNCTION LoadExternalEntity (SystemId, PublicId, - Notation : STRING) : TXmlParser; VIRTUAL; - FUNCTION TranslateEncoding (CONST Source : STRING) : STRING; VIRTUAL; - PROCEDURE DtdElementFound (DtdElementRec : TDtdElementRec); VIRTUAL; - END; - - TValueType = // --- Attribute Value Type - (vtNormal, // Normal specified Attribute - vtImplied, // #IMPLIED attribute value - vtFixed, // #FIXED attribute value - vtDefault); // Attribute value from default value in !ATTLIST declaration - - TAttrDefault = // --- Attribute Default Type - (adDefault, // Normal default value - adRequired, // #REQUIRED attribute - adImplied, // #IMPLIED attribute - adFixed); // #FIXED attribute - - TAttrType = // --- Type of attribute - (atUnknown, // Unknown type - atCData, // Character data only - atID, // ID - atIdRef, // ID Reference - atIdRefs, // Several ID References, separated by Whitespace - atEntity, // Name of an unparsed Entity - atEntities, // Several unparsed Entity names, separated by Whitespace - atNmToken, // Name Token - atNmTokens, // Several Name Tokens, separated by Whitespace - atNotation, // A selection of Notation names (Unparsed Entity) - atEnumeration); // Enumeration - - TElemType = // --- Element content type - (etEmpty, // Element is always empty - etAny, // Element can have any mixture of PCDATA and any elements - etChildren, // Element must contain only elements - etMixed); // Mixed PCDATA and elements - - (*$IFDEF HAS_CONTNRS_UNIT *) - TObjectList = Contnrs.TObjectList; // Re-Export this identifier - (*$ELSE *) - TObjectList = CLASS (TList) - DESTRUCTOR Destroy; OVERRIDE; - PROCEDURE Delete (Index : INTEGER); - PROCEDURE Clear; OVERRIDE; - END; - (*$ENDIF *) - - TNvpNode = CLASS // Name-Value Pair Node - Name : STRING; - Value : STRING; - CONSTRUCTOR Create (TheName : STRING = ''; TheValue : STRING = ''); - END; - - TNvpList = CLASS (TObjectList) // Name-Value Pair List - PROCEDURE Add (Node : TNvpNode); - FUNCTION Node (Name : STRING) : TNvpNode; OVERLOAD; - FUNCTION Node (Index : INTEGER) : TNvpNode; OVERLOAD; - FUNCTION Value (Name : STRING) : STRING; OVERLOAD; - FUNCTION Value (Index : INTEGER) : STRING; OVERLOAD; - FUNCTION Name (Index : INTEGER) : STRING; - END; - - TAttr = CLASS (TNvpNode) // Attribute of a Start-Tag or Empty-Element-Tag - ValueType : TValueType; - AttrType : TAttrType; - END; - - TAttrList = CLASS (TNvpList) // List of Attributes - PROCEDURE Analyze (Start : PChar; VAR Final : PChar); - END; - - TEntityStack = CLASS (TObjectList) // Stack where current position is stored before parsing entities - PROTECTED - Owner : TXmlParser; - PUBLIC - CONSTRUCTOR Create (TheOwner : TXmlParser); - PROCEDURE Push (LastPos : PChar); OVERLOAD; - PROCEDURE Push (Instance : TObject; LastPos : PChar); OVERLOAD; - FUNCTION Pop : PChar; // Returns next char or NIL if EOF is reached. Frees Instance. - END; - - TAttrDef = CLASS (TNvpNode) // Represents a '; - - // --- Name Constants for the above enumeration types - CPartType_Name : ARRAY [TPartType] OF STRING = - ('', 'XML Prolog', 'Comment', 'PI', - 'DTD Declaration', 'Start Tag', 'Empty Tag', 'End Tag', - 'Text', 'CDATA'); - CValueType_Name : ARRAY [TValueType] OF STRING = ('Normal', 'Implied', 'Fixed', 'Default'); - CAttrDefault_Name : ARRAY [TAttrDefault] OF STRING = ('Default', 'Required', 'Implied', 'Fixed'); - CElemType_Name : ARRAY [TElemType] OF STRING = ('Empty', 'Any', 'Childs only', 'Mixed'); - CAttrType_Name : ARRAY [TAttrType] OF STRING = ('Unknown', 'CDATA', - 'ID', 'IDREF', 'IDREFS', - 'ENTITY', 'ENTITIES', - 'NMTOKEN', 'NMTOKENS', - 'Notation', 'Enumeration'); - -FUNCTION ConvertWs (Source: STRING; PackWs: BOOLEAN) : STRING; // Convert WS to spaces #x20 -PROCEDURE SetStringSF (VAR S : STRING; BufferStart, BufferFinal : PChar); // SetString by Start/Final of buffer -FUNCTION StrSFPas (Start, Finish : PChar) : STRING; // Convert buffer part to Pascal string -FUNCTION TrimWs (Source : STRING) : STRING; // Trim Whitespace - -FUNCTION AnsiToUtf8 (Source : ANSISTRING) : STRING; // Convert Win-1252 to UTF-8 -FUNCTION Utf8ToAnsi (Source : STRING; UnknownChar : CHAR = '¿') : ANSISTRING; // Convert UTF-8 to Win-1252 - - -(* -=============================================================================================== -TCustomXmlScanner event based component wrapper for TXmlParser -=============================================================================================== -*) - -TYPE - TCustomXmlScanner = CLASS; - TXmlPrologEvent = PROCEDURE (Sender : TObject; XmlVersion, Encoding: STRING; Standalone : BOOLEAN) OF OBJECT; - TCommentEvent = PROCEDURE (Sender : TObject; Comment : STRING) OF OBJECT; - TPIEvent = PROCEDURE (Sender : TObject; Target, Content: STRING; Attributes : TAttrList) OF OBJECT; - TDtdEvent = PROCEDURE (Sender : TObject; RootElementName : STRING) OF OBJECT; - TStartTagEvent = PROCEDURE (Sender : TObject; TagName : STRING; Attributes : TAttrList) OF OBJECT; - TEndTagEvent = PROCEDURE (Sender : TObject; TagName : STRING) OF OBJECT; - TContentEvent = PROCEDURE (Sender : TObject; Content : STRING) OF OBJECT; - TElementEvent = PROCEDURE (Sender : TObject; ElemDef : TElemDef) OF OBJECT; - TEntityEvent = PROCEDURE (Sender : TObject; EntityDef : TEntityDef) OF OBJECT; - TNotationEvent = PROCEDURE (Sender : TObject; NotationDef : TNotationDef) OF OBJECT; - TErrorEvent = PROCEDURE (Sender : TObject; ErrorPos : PChar) OF OBJECT; - TExternalEvent = PROCEDURE (Sender : TObject; SystemId, PublicId, NotationId : STRING; - VAR Result : TXmlParser) OF OBJECT; - TEncodingEvent = FUNCTION (Sender : TObject; CurrentEncoding, Source : STRING) : STRING OF OBJECT; - - - TCustomXmlScanner = CLASS (TComponent) - PROTECTED - FXmlParser : TXmlParser; - FOnXmlProlog : TXmlPrologEvent; - FOnComment : TCommentEvent; - FOnPI : TPIEvent; - FOnDtdRead : TDtdEvent; - FOnStartTag : TStartTagEvent; - FOnEmptyTag : TStartTagEvent; - FOnEndTag : TEndTagEvent; - FOnContent : TContentEvent; - FOnCData : TContentEvent; - FOnElement : TElementEvent; - FOnAttList : TElementEvent; - FOnEntity : TEntityEvent; - FOnNotation : TNotationEvent; - FOnDtdError : TErrorEvent; - FOnLoadExternal : TExternalEvent; - FOnTranslateEncoding : TEncodingEvent; - FStopParser : BOOLEAN; - FUNCTION GetNormalize : BOOLEAN; - PROCEDURE SetNormalize (Value : BOOLEAN); - - PROCEDURE WhenXmlProlog(XmlVersion, Encoding: STRING; Standalone : BOOLEAN); VIRTUAL; - PROCEDURE WhenComment (Comment : STRING); VIRTUAL; - PROCEDURE WhenPI (Target, Content: STRING; Attributes : TAttrList); VIRTUAL; - PROCEDURE WhenDtdRead (RootElementName : STRING); VIRTUAL; - PROCEDURE WhenStartTag (TagName : STRING; Attributes : TAttrList); VIRTUAL; - PROCEDURE WhenEmptyTag (TagName : STRING; Attributes : TAttrList); VIRTUAL; - PROCEDURE WhenEndTag (TagName : STRING); VIRTUAL; - PROCEDURE WhenContent (Content : STRING); VIRTUAL; - PROCEDURE WhenCData (Content : STRING); VIRTUAL; - PROCEDURE WhenElement (ElemDef : TElemDef); VIRTUAL; - PROCEDURE WhenAttList (ElemDef : TElemDef); VIRTUAL; - PROCEDURE WhenEntity (EntityDef : TEntityDef); VIRTUAL; - PROCEDURE WhenNotation (NotationDef : TNotationDef); VIRTUAL; - PROCEDURE WhenDtdError (ErrorPos : PChar); VIRTUAL; - - PUBLIC - CONSTRUCTOR Create (AOwner: TComponent); OVERRIDE; - DESTRUCTOR Destroy; OVERRIDE; - - PROCEDURE LoadFromFile (Filename : TFilename); // Load XML Document from file - PROCEDURE LoadFromBuffer (Buffer : PChar); // Load XML Document from buffer - PROCEDURE SetBuffer (Buffer : PChar); // Refer to Buffer - FUNCTION GetFilename : TFilename; - - PROCEDURE Execute; // Perform scanning - - PROTECTED - PROPERTY XmlParser : TXmlParser READ FXmlParser; - PROPERTY StopParser : BOOLEAN READ FStopParser WRITE FStopParser; - PROPERTY Filename : TFilename READ GetFilename WRITE LoadFromFile; - PROPERTY Normalize : BOOLEAN READ GetNormalize WRITE SetNormalize; - PROPERTY OnXmlProlog : TXmlPrologEvent READ FOnXmlProlog WRITE FOnXmlProlog; - PROPERTY OnComment : TCommentEvent READ FOnComment WRITE FOnComment; - PROPERTY OnPI : TPIEvent READ FOnPI WRITE FOnPI; - PROPERTY OnDtdRead : TDtdEvent READ FOnDtdRead WRITE FOnDtdRead; - PROPERTY OnStartTag : TStartTagEvent READ FOnStartTag WRITE FOnStartTag; - PROPERTY OnEmptyTag : TStartTagEvent READ FOnEmptyTag WRITE FOnEmptyTag; - PROPERTY OnEndTag : TEndTagEvent READ FOnEndTag WRITE FOnEndTag; - PROPERTY OnContent : TContentEvent READ FOnContent WRITE FOnContent; - PROPERTY OnCData : TContentEvent READ FOnCData WRITE FOnCData; - PROPERTY OnElement : TElementEvent READ FOnElement WRITE FOnElement; - PROPERTY OnAttList : TElementEvent READ FOnAttList WRITE FOnAttList; - PROPERTY OnEntity : TEntityEvent READ FOnEntity WRITE FOnEntity; - PROPERTY OnNotation : TNotationEvent READ FOnNotation WRITE FOnNotation; - PROPERTY OnDtdError : TErrorEvent READ FOnDtdError WRITE FOnDtdError; - PROPERTY OnLoadExternal : TExternalEvent READ FOnLoadExternal WRITE FOnLoadExternal; - PROPERTY OnTranslateEncoding : TEncodingEvent READ FOnTranslateEncoding WRITE FOnTranslateEncoding; - END; - -(* -=============================================================================================== -IMPLEMENTATION -=============================================================================================== -*) - -IMPLEMENTATION - - -(* -=============================================================================================== -Unicode and UTF-8 stuff -=============================================================================================== -*) - -CONST - // --- Character Translation Table for Unicode <-> Win-1252 - WIN1252_UNICODE : ARRAY [$00..$FF] OF WORD = ( - $0000, $0001, $0002, $0003, $0004, $0005, $0006, $0007, $0008, $0009, - $000A, $000B, $000C, $000D, $000E, $000F, $0010, $0011, $0012, $0013, - $0014, $0015, $0016, $0017, $0018, $0019, $001A, $001B, $001C, $001D, - $001E, $001F, $0020, $0021, $0022, $0023, $0024, $0025, $0026, $0027, - $0028, $0029, $002A, $002B, $002C, $002D, $002E, $002F, $0030, $0031, - $0032, $0033, $0034, $0035, $0036, $0037, $0038, $0039, $003A, $003B, - $003C, $003D, $003E, $003F, $0040, $0041, $0042, $0043, $0044, $0045, - $0046, $0047, $0048, $0049, $004A, $004B, $004C, $004D, $004E, $004F, - $0050, $0051, $0052, $0053, $0054, $0055, $0056, $0057, $0058, $0059, - $005A, $005B, $005C, $005D, $005E, $005F, $0060, $0061, $0062, $0063, - $0064, $0065, $0066, $0067, $0068, $0069, $006A, $006B, $006C, $006D, - $006E, $006F, $0070, $0071, $0072, $0073, $0074, $0075, $0076, $0077, - $0078, $0079, $007A, $007B, $007C, $007D, $007E, $007F, - - $20AC, $0081, $201A, $0192, $201E, $2026, $2020, $2021, $02C6, $2030, - $0160, $2039, $0152, $008D, $017D, $008F, $0090, $2018, $2019, $201C, - $201D, $2022, $2013, $2014, $02DC, $2122, $0161, $203A, $0153, $009D, - $017E, $0178, $00A0, $00A1, $00A2, $00A3, $00A4, $00A5, $00A6, $00A7, - $00A8, $00A9, $00AA, $00AB, $00AC, $00AD, $00AE, $00AF, $00B0, $00B1, - $00B2, $00B3, $00B4, $00B5, $00B6, $00B7, $00B8, $00B9, $00BA, $00BB, - $00BC, $00BD, $00BE, $00BF, $00C0, $00C1, $00C2, $00C3, $00C4, $00C5, - $00C6, $00C7, $00C8, $00C9, $00CA, $00CB, $00CC, $00CD, $00CE, $00CF, - $00D0, $00D1, $00D2, $00D3, $00D4, $00D5, $00D6, $00D7, $00D8, $00D9, - $00DA, $00DB, $00DC, $00DD, $00DE, $00DF, $00E0, $00E1, $00E2, $00E3, - $00E4, $00E5, $00E6, $00E7, $00E8, $00E9, $00EA, $00EB, $00EC, $00ED, - $00EE, $00EF, $00F0, $00F1, $00F2, $00F3, $00F4, $00F5, $00F6, $00F7, - $00F8, $00F9, $00FA, $00FB, $00FC, $00FD, $00FE, $00FF); - -(* UTF-8 (somewhat simplified) - ----- - Character Range Byte sequence - --------------- -------------------------- (x=Bits from original character) - $0000..$007F 0xxxxxxx - $0080..$07FF 110xxxxx 10xxxxxx - $8000..$FFFF 1110xxxx 10xxxxxx 10xxxxxx - - Example - -------- - Transforming the Unicode character U+00E4 LATIN SMALL LETTER A WITH DIAERESIS ("ä"): - - ISO-8859-1, Decimal 228 - Win1252, Hex $E4 - ANSI Bin 1110 0100 - abcd efgh - - UTF-8 Binary 1100xxab 10cdefgh - Binary 11000011 10100100 - Hex $C3 $A4 - Decimal 195 164 - ANSI Ã ¤ *) - - -FUNCTION AnsiToUtf8 (Source : ANSISTRING) : STRING; - (* Converts the given Windows ANSI (Win1252) String to UTF-8. *) -VAR - I : INTEGER; // Loop counter - U : WORD; // Current Unicode value - Len : INTEGER; // Current real length of "Result" string -BEGIN - SetLength (Result, Length (Source) * 3); // Worst case - Len := 0; - FOR I := 1 TO Length (Source) DO BEGIN - U := WIN1252_UNICODE [ORD (Source [I])]; - CASE U OF - $0000..$007F : BEGIN - INC (Len); - Result [Len] := CHR (U); - END; - $0080..$07FF : BEGIN - INC (Len); - Result [Len] := CHR ($C0 OR (U SHR 6)); - INC (Len); - Result [Len] := CHR ($80 OR (U AND $3F)); - END; - $0800..$FFFF : BEGIN - INC (Len); - Result [Len] := CHR ($E0 OR (U SHR 12)); - INC (Len); - Result [Len] := CHR ($80 OR ((U SHR 6) AND $3F)); - INC (Len); - Result [Len] := CHR ($80 OR (U AND $3F)); - END; - END; - END; - SetLength (Result, Len); -END; - - -FUNCTION Utf8ToAnsi (Source : STRING; UnknownChar : CHAR = '¿') : ANSISTRING; - (* Converts the given UTF-8 String to Windows ANSI (Win-1252). - If a character can not be converted, the "UnknownChar" is inserted. *) -VAR - SourceLen : INTEGER; // Length of Source string - I, K : INTEGER; - A : BYTE; // Current ANSI character value - U : WORD; - Ch : CHAR; // Dest char - Len : INTEGER; // Current real length of "Result" string -BEGIN - SourceLen := Length (Source); - SetLength (Result, SourceLen); // Enough room to live - Len := 0; - I := 1; - WHILE I <= SourceLen DO BEGIN - A := ORD (Source [I]); - IF A < $80 THEN BEGIN // Range $0000..$007F - INC (Len); - Result [Len] := Source [I]; - INC (I); - END - ELSE BEGIN // Determine U, Inc I - IF (A AND $E0 = $C0) AND (I < SourceLen) THEN BEGIN // Range $0080..$07FF - U := (WORD (A AND $1F) SHL 6) OR (ORD (Source [I+1]) AND $3F); - INC (I, 2); - END - ELSE IF (A AND $F0 = $E0) AND (I < SourceLen-1) THEN BEGIN // Range $0800..$FFFF - U := (WORD (A AND $0F) SHL 12) OR - (WORD (ORD (Source [I+1]) AND $3F) SHL 6) OR - ( ORD (Source [I+2]) AND $3F); - INC (I, 3); - END - ELSE BEGIN // Unknown/unsupported - INC (I); - FOR K := 7 DOWNTO 0 DO - IF A AND (1 SHL K) = 0 THEN BEGIN - INC (I, (A SHR (K+1))-1); - BREAK; - END; - U := WIN1252_UNICODE [ORD (UnknownChar)]; - END; - Ch := UnknownChar; // Retrieve ANSI char - FOR A := $00 TO $FF DO - IF WIN1252_UNICODE [A] = U THEN BEGIN - Ch := CHR (A); - BREAK; - END; - INC (Len); - Result [Len] := Ch; - END; - END; - SetLength (Result, Len); -END; - - -(* -=============================================================================================== -"Special" Helper Functions - -Don't ask me why. But including these functions makes the parser *DRAMATICALLY* faster -on my K6-233 machine. You can test it yourself just by commenting them out. -They do exactly the same as the Assembler routines defined in SysUtils. -(This is where you can see how great the Delphi compiler really is. The compiled code is -faster than hand-coded assembler!) -=============================================================================================== ---> Just move this line below the StrScan function --> *) - - -FUNCTION StrPos (CONST Str, SearchStr : PChar) : PChar; - // Same functionality as SysUtils.StrPos -VAR - First : CHAR; - Len : INTEGER; -BEGIN - First := SearchStr^; - Len := StrLen (SearchStr); - Result := Str; - REPEAT - IF Result^ = First THEN - IF StrLComp (Result, SearchStr, Len) = 0 THEN BREAK; - IF Result^ = #0 THEN BEGIN - Result := NIL; - BREAK; - END; - INC (Result); - UNTIL FALSE; -END; - - -FUNCTION StrScan (CONST Start : PChar; CONST Ch : CHAR) : PChar; - // Same functionality as SysUtils.StrScan -BEGIN - Result := Start; - WHILE Result^ <> Ch DO BEGIN - IF Result^ = #0 THEN BEGIN - Result := NIL; - EXIT; - END; - INC (Result); - END; -END; - - -(* -=============================================================================================== -Helper Functions -=============================================================================================== -*) - -FUNCTION DelChars (Source : STRING; CharsToDelete : TCharset) : STRING; - // Delete all "CharsToDelete" from the string -VAR - I : INTEGER; -BEGIN - Result := Source; - FOR I := Length (Result) DOWNTO 1 DO - IF Result [I] IN CharsToDelete THEN - Delete (Result, I, 1); -END; - - -FUNCTION TrimWs (Source : STRING) : STRING; - // Trimms off Whitespace characters from both ends of the string -VAR - I : INTEGER; -BEGIN - // --- Trim Left - I := 1; - WHILE (I <= Length (Source)) AND (Source [I] IN CWhitespace) DO - INC (I); - Result := Copy (Source, I, MaxInt); - - // --- Trim Right - I := Length (Result); - WHILE (I > 1) AND (Result [I] IN CWhitespace) DO - DEC (I); - Delete (Result, I+1, Length (Result)-I); -END; - - -FUNCTION ConvertWs (Source: STRING; PackWs: BOOLEAN) : STRING; - // Converts all Whitespace characters to the Space #x20 character - // If "PackWs" is true, contiguous Whitespace characters are packed to one -VAR - I : INTEGER; -BEGIN - Result := Source; - FOR I := Length (Result) DOWNTO 1 DO - IF (Result [I] IN CWhitespace) THEN - IF PackWs AND (I > 1) AND (Result [I-1] IN CWhitespace) - THEN Delete (Result, I, 1) - ELSE Result [I] := #32; -END; - - -PROCEDURE SetStringSF (VAR S : STRING; BufferStart, BufferFinal : PChar); -BEGIN - SetString (S, BufferStart, BufferFinal-BufferStart+1); -END; - - -FUNCTION StrLPas (Start : PChar; Len : INTEGER) : STRING; -BEGIN - SetString (Result, Start, Len); -END; - - -FUNCTION StrSFPas (Start, Finish : PChar) : STRING; -BEGIN - SetString (Result, Start, Finish-Start+1); -END; - - -FUNCTION StrScanE (CONST Source : PChar; CONST CharToScanFor : CHAR) : PChar; - // If "CharToScanFor" is not found, StrScanE returns the last char of the - // buffer instead of NIL -BEGIN - Result := StrScan (Source, CharToScanFor); - IF Result = NIL THEN - Result := StrEnd (Source)-1; -END; - - -PROCEDURE ExtractName (Start : PChar; Terminators : TCharset; VAR Final : PChar); - (* Extracts the complete Name beginning at "Start". - It is assumed that the name is contained in Markup, so the '>' character is - always a Termination. - Start: IN Pointer to first char of name. Is always considered to be valid - Terminators: IN Characters which terminate the name - Final: OUT Pointer to last char of name *) -BEGIN - Final := Start+1; - Include (Terminators, #0); - Include (Terminators, '>'); - WHILE NOT (Final^ IN Terminators) DO - INC (Final); - DEC (Final); -END; - - -PROCEDURE ExtractQuote (Start : PChar; VAR Content : STRING; VAR Final : PChar); - (* Extract a string which is contained in single or double Quotes. - Start: IN Pointer to opening quote - Content: OUT The quoted string - Final: OUT Pointer to closing quote *) -BEGIN - Final := StrScan (Start+1, Start^); - IF Final = NIL THEN BEGIN - Final := StrEnd (Start+1)-1; - SetString (Content, Start+1, Final-Start); - END - ELSE - SetString (Content, Start+1, Final-1-Start); -END; - - -(* -=============================================================================================== -TEntityStackNode -This Node is pushed to the "Entity Stack" whenever the parser parses entity replacement text. -The "Instance" field holds the Instance pointer of an External Entity buffer. When it is -popped, the Instance is freed. -The "Encoding" field holds the name of the Encoding. External Parsed Entities may have -another encoding as the document entity (XmlSpec 4.3.3). So when there is an " 0 THEN BEGIN - ESN := TEntityStackNode (Items [Count-1]); - Result := ESN.LastPos; - IF ESN.Instance <> NIL THEN - ESN.Instance.Free; - IF ESN.Encoding <> '' THEN - Owner.FCurEncoding := ESN.Encoding; // Restore current Encoding - Delete (Count-1); - END - ELSE - Result := NIL; -END; - - -(* -=============================================================================================== -TExternalID ------------ -XmlSpec 4.2.2: ExternalID ::= 'SYSTEM' S SystemLiteral | - 'PUBLIC' S PubidLiteral S SystemLiteral -XmlSpec 4.7: PublicID ::= 'PUBLIC' S PubidLiteral -SystemLiteral and PubidLiteral are quoted -=============================================================================================== -*) - -TYPE - TExternalID = CLASS - PublicId : STRING; - SystemId : STRING; - Final : PChar; - CONSTRUCTOR Create (Start : PChar); - END; - -CONSTRUCTOR TExternalID.Create (Start : PChar); -BEGIN - INHERITED Create; - Final := Start; - IF StrLComp (Start, 'SYSTEM', 6) = 0 THEN BEGIN - WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); - IF NOT (Final^ IN CQuoteChar) THEN EXIT; - ExtractQuote (Final, SystemID, Final); - END - ELSE IF StrLComp (Start, 'PUBLIC', 6) = 0 THEN BEGIN - WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); - IF NOT (Final^ IN CQuoteChar) THEN EXIT; - ExtractQuote (Final, PublicID, Final); - INC (Final); - WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); - IF NOT (Final^ IN CQuoteChar) THEN EXIT; - ExtractQuote (Final, SystemID, Final); - END; -END; - - -(* -=============================================================================================== -TXmlParser -=============================================================================================== -*) - -CONSTRUCTOR TXmlParser.Create; -BEGIN - INHERITED Create; - FBuffer := NIL; - FBufferSize := 0; - Elements := TElemList.Create; - Entities := TNvpList.Create; - ParEntities := TNvpList.Create; - Notations := TNvpList.Create; - CurAttr := TAttrList.Create; - EntityStack := TEntityStack.Create (Self); - Clear; -END; - - -DESTRUCTOR TXmlParser.Destroy; -BEGIN - Clear; - Elements.Free; - Entities.Free; - ParEntities.Free; - Notations.Free; - CurAttr.Free; - EntityStack.Free; - INHERITED Destroy; -END; - - -PROCEDURE TXmlParser.Clear; - // Free Buffer and clear all object attributes -BEGIN - IF (FBufferSize > 0) AND (FBuffer <> NIL) THEN - FreeMem (FBuffer); - FBuffer := NIL; - FBufferSize := 0; - FSource := ''; - FXmlVersion := ''; - FEncoding := ''; - FStandalone := FALSE; - FRootName := ''; - FDtdcFinal := NIL; - FNormalize := TRUE; - Elements.Clear; - Entities.Clear; - ParEntities.Clear; - Notations.Clear; - CurAttr.Clear; - EntityStack.Clear; -END; - - -FUNCTION TXmlParser.LoadFromFile (Filename : STRING; FileMode : INTEGER = fmOpenRead OR fmShareDenyNone) : BOOLEAN; - // Loads Document from given file - // Returns TRUE if successful -VAR - f : FILE; - ReadIn : INTEGER; - OldFileMode : INTEGER; -BEGIN - Result := FALSE; - Clear; - - // --- Open File - OldFileMode := SYSTEM.FileMode; - TRY - SYSTEM.FileMode := FileMode; - TRY - AssignFile (f, Filename); - Reset (f, 1); - EXCEPT - EXIT; - END; - - TRY - // --- Allocate Memory - TRY - FBufferSize := Filesize (f) + 1; - GetMem (FBuffer, FBufferSize); - EXCEPT - Clear; - EXIT; - END; - - // --- Read File - TRY - BlockRead (f, FBuffer^, FBufferSize, ReadIn); - (FBuffer+ReadIn)^ := #0; // NULL termination - EXCEPT - Clear; - EXIT; - END; - FINALLY - CloseFile (f); - END; - - FSource := Filename; - Result := TRUE; - - FINALLY - SYSTEM.FileMode := OldFileMode; - END; -END; - - -FUNCTION TXmlParser.LoadFromBuffer (Buffer : PChar) : BOOLEAN; - // Loads Document from another buffer - // Returns TRUE if successful - // The "Source" property becomes '' if successful -BEGIN - Result := FALSE; - Clear; - FBufferSize := StrLen (Buffer) + 1; - TRY - GetMem (FBuffer, FBufferSize); - EXCEPT - Clear; - EXIT; - END; - StrCopy (FBuffer, Buffer); - FSource := ''; - Result := TRUE; -END; - - -PROCEDURE TXmlParser.SetBuffer (Buffer : PChar); // References another buffer -BEGIN - Clear; - FBuffer := Buffer; - FBufferSize := 0; - FSource := ''; -END; - - -//----------------------------------------------------------------------------------------------- -// Scanning through the document -//----------------------------------------------------------------------------------------------- - -PROCEDURE TXmlParser.StartScan; -BEGIN - CurPartType := ptNone; - CurName := ''; - CurContent := ''; - CurStart := NIL; - CurFinal := NIL; - CurAttr.Clear; - EntityStack.Clear; -END; - - -FUNCTION TXmlParser.Scan : BOOLEAN; - // Scans the next Part - // Returns TRUE if a part could be found, FALSE if there is no part any more - // - // "IsDone" can be set to FALSE by AnalyzeText in order to go to the next part - // if there is no Content due to normalization -VAR - IsDone : BOOLEAN; -BEGIN - REPEAT - IsDone := TRUE; - - // --- Start of next Part - IF CurStart = NIL - THEN CurStart := DocBuffer - ELSE CurStart := CurFinal+1; - CurFinal := CurStart; - - // --- End of Document of Pop off a new part from the Entity stack? - IF CurStart^ = #0 THEN - CurStart := EntityStack.Pop; - - // --- No Document or End Of Document: Terminate Scan - IF (CurStart = NIL) OR (CurStart^ = #0) THEN BEGIN - CurStart := StrEnd (DocBuffer); - CurFinal := CurStart-1; - EntityStack.Clear; - Result := FALSE; - EXIT; - END; - - IF (StrLComp (CurStart, ''); - IF CurFinal <> NIL - THEN INC (CurFinal) - ELSE CurFinal := StrEnd (CurStart)-1; - FCurEncoding := AnsiUpperCase (CurAttr.Value ('encoding')); - IF FCurEncoding = '' THEN - FCurEncoding := 'UTF-8'; // Default XML Encoding is UTF-8 - CurPartType := ptXmlProlog; - CurName := ''; - CurContent := ''; -END; - - -PROCEDURE TXmlParser.AnalyzeComment (Start : PChar; VAR Final : PChar); - // Analyze Comments -BEGIN - Final := StrPos (Start+4, '-->'); - IF Final = NIL - THEN Final := StrEnd (Start)-1 - ELSE INC (Final, 2); - CurPartType := ptComment; -END; - - -PROCEDURE TXmlParser.AnalyzePI (Start : PChar; VAR Final : PChar); - // Analyze Processing Instructions (PI) - // This is also called for Character -VAR - F : PChar; -BEGIN - CurPartType := ptPI; - Final := StrPos (Start+2, '?>'); - IF Final = NIL - THEN Final := StrEnd (Start)-1 - ELSE INC (Final); - ExtractName (Start+2, CWhitespace + ['?', '>'], F); - SetStringSF (CurName, Start+2, F); - SetStringSF (CurContent, F+1, Final-2); - CurAttr.Analyze (F+1, F); -END; - - -PROCEDURE TXmlParser.AnalyzeDtdc; - (* Analyze Document Type Declaration - doctypedecl ::= '' - markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment - PEReference ::= '%' Name ';' - - elementdecl ::= '' - AttlistDecl ::= '' - EntityDecl ::= '' | - '' - NotationDecl ::= '' - PI ::= '' Char* )))? '?>' - Comment ::= '' *) -TYPE - TPhase = (phName, phDtd, phInternal, phFinishing); -VAR - Phase : TPhase; - F : PChar; - ExternalID : TExternalID; - ExternalDTD : TXmlParser; - DER : TDtdElementRec; -BEGIN - DER.Start := CurStart; - EntityStack.Clear; // Clear stack for Parameter Entities - CurPartType := ptDtdc; - - // --- Don't read DTDc twice - IF FDtdcFinal <> NIL THEN BEGIN - CurFinal := FDtdcFinal; - EXIT; - END; - - // --- Scan DTDc - CurFinal := CurStart + 9; // First char after '' : BREAK; - ELSE IF NOT (CurFinal^ IN CWhitespace) THEN BEGIN - CASE Phase OF - phName : IF (CurFinal^ IN CNameStart) THEN BEGIN - ExtractName (CurFinal, CWhitespace + ['[', '>'], F); - SetStringSF (FRootName, CurFinal, F); - CurFinal := F; - Phase := phDtd; - END; - phDtd : IF (StrLComp (CurFinal, 'SYSTEM', 6) = 0) OR - (StrLComp (CurFinal, 'PUBLIC', 6) = 0) THEN BEGIN - ExternalID := TExternalID.Create (CurFinal); - ExternalDTD := LoadExternalEntity (ExternalId.SystemId, ExternalID.PublicId, ''); - F := StrPos (ExternalDtd.DocBuffer, ' NIL THEN - AnalyzeDtdElements (F, F); - ExternalDTD.Free; - CurFinal := ExternalID.Final; - ExternalID.Free; - END; - ELSE BEGIN - DER.ElementType := deError; - DER.Pos := CurFinal; - DER.Final := CurFinal; - DtdElementFound (DER); - END; - END; - - END; - END; - INC (CurFinal); - UNTIL FALSE; - - CurPartType := ptDtdc; - CurName := ''; - CurContent := ''; - - // It is an error in the document if "EntityStack" is not empty now - IF EntityStack.Count > 0 THEN BEGIN - DER.ElementType := deError; - DER.Final := CurFinal; - DER.Pos := CurFinal; - DtdElementFound (DER); - END; - - EntityStack.Clear; // Clear stack for General Entities - FDtdcFinal := CurFinal; -END; - - -PROCEDURE TXmlParser.AnalyzeDtdElements (Start : PChar; VAR Final : PChar); - // Analyze the "Elements" of a DTD contained in the external or - // internal DTD subset. -VAR - DER : TDtdElementRec; -BEGIN - Final := Start; - REPEAT - CASE Final^ OF - '%' : BEGIN - PushPE (Final); - CONTINUE; - END; - #0 : IF EntityStack.Count = 0 THEN - BREAK - ELSE BEGIN - CurFinal := EntityStack.Pop; - CONTINUE; - END; - ']', - '>' : BREAK; - '<' : IF StrLComp (Final, ''); - - // --- Set Default Attribute values for nonexistent attributes - IF (CurPartType = ptStartTag) OR (CurPartType = ptEmptyTag) THEN BEGIN - ElemDef := Elements.Node (CurName); - IF ElemDef <> NIL THEN BEGIN - FOR I := 0 TO ElemDef.Count-1 DO BEGIN - AttrDef := TAttrDef (ElemDef [I]); - Attr := TAttr (CurAttr.Node (AttrDef.Name)); - IF (Attr = NIL) AND (AttrDef.Value <> '') THEN BEGIN - Attr := TAttr.Create (AttrDef.Name, AttrDef.Value); - Attr.ValueType := vtDefault; - CurAttr.Add (Attr); - END; - IF Attr <> NIL THEN BEGIN - CASE AttrDef.DefaultType OF - adDefault : ; - adRequired : ; // -!- It is an error in the document if "Attr.Value" is an empty string - adImplied : Attr.ValueType := vtImplied; - adFixed : BEGIN - Attr.ValueType := vtFixed; - Attr.Value := AttrDef.Value; - END; - END; - Attr.AttrType := AttrDef.AttrType; - END; - END; - END; - - // --- Normalize Attribute Values. XmlSpec: - // - a character reference is processed by appending the referenced character to the attribute value - // - an entity reference is processed by recursively processing the replacement text of the entity - // - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20 to the normalized value, - // except that only a single #x20 is appended for a "#xD#xA" sequence that is part of an external - // parsed entity or the literal entity value of an internal parsed entity - // - other characters are processed by appending them to the normalized value - // If the declared value is not CDATA, then the XML processor must further process the - // normalized attribute value by discarding any leading and trailing space (#x20) characters, - // and by replacing sequences of space (#x20) characters by a single space (#x20) character. - // All attributes for which no declaration has been read should be treated by a - // non-validating parser as if declared CDATA. - // !!! The XML 1.0 SE specification is somewhat different here - // This code does not conform exactly to this specification - FOR I := 0 TO CurAttr.Count-1 DO - WITH TAttr (CurAttr [I]) DO BEGIN - ReplaceGeneralEntities (Value); - ReplaceCharacterEntities (Value); - IF (AttrType <> atCData) AND (AttrType <> atUnknown) - THEN Value := TranslateEncoding (TrimWs (ConvertWs (Value, TRUE))) - ELSE Value := TranslateEncoding (ConvertWs (Value, FALSE)); - END; - END; -END; - - -PROCEDURE TXmlParser.AnalyzeCData; - // Analyze CDATA Sections -BEGIN - CurPartType := ptCData; - CurFinal := StrPos (CurStart, CDEnd); - IF CurFinal = NIL THEN BEGIN - CurFinal := StrEnd (CurStart)-1; - CurContent := TranslateEncoding (StrPas (CurStart+Length (CDStart))); - END - ELSE BEGIN - SetStringSF (CurContent, CurStart+Length (CDStart), CurFinal-1); - INC (CurFinal, Length (CDEnd)-1); - CurContent := TranslateEncoding (CurContent); - END; -END; - - -PROCEDURE TXmlParser.AnalyzeText (VAR IsDone : BOOLEAN); - (* Analyzes Text Content between Tags. CurFinal will point to the last content character. - Content ends at a '<' character or at the end of the document. - Entity References and Character Entity references are resolved. - If PackSpaces is TRUE, contiguous Whitespace Characters will be compressed to - one Space #x20 character, Whitespace at the beginning and end of content will - be trimmed off and content which is or becomes empty is not returned to - the application (in this case, "IsDone" is set to FALSE which causes the - Scan method to proceed directly to the next part. *) - - PROCEDURE ProcessEntity; - (* Is called if there is an ampsersand '&' character found in the document. - IN "CurFinal" points to the ampersand - OUT "CurFinal" points to the first character after the semi-colon ';' *) - VAR - P : PChar; - Name : STRING; - EntityDef : TEntityDef; - ExternalEntity : TXmlParser; - BEGIN - P := StrScan (CurFinal , ';'); - IF P <> NIL THEN BEGIN - SetStringSF (Name, CurFinal+1, P-1); - - // Is it a Character Entity? - IF (CurFinal+1)^ = '#' THEN BEGIN - IF UpCase ((CurFinal+2)^) = 'X' // !!! Can't use "CHR" for Unicode characters > 255: - THEN CurContent := CurContent + CHR (StrToIntDef ('$'+Copy (Name, 3, MaxInt), 32)) - ELSE CurContent := CurContent + CHR (StrToIntDef (Copy (Name, 2, MaxInt), 32)); - CurFinal := P+1; - EXIT; - END - - // Is it a Predefined Entity? - ELSE IF Name = 'lt' THEN BEGIN CurContent := CurContent + '<'; CurFinal := P+1; EXIT; END - ELSE IF Name = 'gt' THEN BEGIN CurContent := CurContent + '>'; CurFinal := P+1; EXIT; END - ELSE IF Name = 'amp' THEN BEGIN CurContent := CurContent + '&'; CurFinal := P+1; EXIT; END - ELSE IF Name = 'apos' THEN BEGIN CurContent := CurContent + ''''; CurFinal := P+1; EXIT; END - ELSE IF Name = 'quot' THEN BEGIN CurContent := CurContent + '"'; CurFinal := P+1; EXIT; END; - - // Replace with Entity from DTD - EntityDef := TEntityDef (Entities.Node (Name)); - IF EntityDef <> NIL THEN BEGIN - IF EntityDef.Value <> '' THEN BEGIN - EntityStack.Push (P+1); - CurFinal := PChar (EntityDef.Value); - END - ELSE BEGIN - ExternalEntity := LoadExternalEntity (EntityDef.SystemId, EntityDef.PublicId, EntityDef.NotationName); - EntityStack.Push (ExternalEntity, P+1); - CurFinal := ExternalEntity.DocBuffer; - END; - END - ELSE BEGIN - CurContent := CurContent + Name; - CurFinal := P+1; - END; - END - ELSE BEGIN - INC (CurFinal); - END; - END; - -VAR - C : INTEGER; -BEGIN - CurFinal := CurStart; - CurPartType := ptContent; - CurContent := ''; - C := 0; - REPEAT - CASE CurFinal^ OF - '&' : BEGIN - CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); - C := 0; - ProcessEntity; - CONTINUE; - END; - #0 : BEGIN - IF EntityStack.Count = 0 THEN - BREAK - ELSE BEGIN - CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); - C := 0; - CurFinal := EntityStack.Pop; - CONTINUE; - END; - END; - '<' : BREAK; - ELSE INC (C); - END; - INC (CurFinal); - UNTIL FALSE; - CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); - DEC (CurFinal); - - IF FNormalize THEN BEGIN - CurContent := ConvertWs (TrimWs (CurContent), TRUE); - IsDone := CurContent <> ''; // IsDone will only get FALSE if PackSpaces is TRUE - END; -END; - - -PROCEDURE TXmlParser.AnalyzeElementDecl (Start : PChar; VAR Final : PChar); - (* Parse ' character - XmlSpec 3.2: - elementdecl ::= '' - contentspec ::= 'EMPTY' | 'ANY' | Mixed | children - Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | - '(' S? '#PCDATA' S? ')' - children ::= (choice | seq) ('?' | '*' | '+')? - choice ::= '(' S? cp ( S? '|' S? cp )* S? ')' - cp ::= (Name | choice | seq) ('?' | '*' | '+')? - seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' - - More simply: - contentspec ::= EMPTY - ANY - '(#PCDATA)' - '(#PCDATA | A | B)*' - '(A, B, C)' - '(A | B | C)' - '(A?, B*, C+), - '(A, (B | C | D)* )' *) -VAR - Element : TElemDef; - Elem2 : TElemDef; - F : PChar; - DER : TDtdElementRec; -BEGIN - Element := TElemDef.Create; - Final := Start + 9; - DER.Start := Start; - REPEAT - IF Final^ = '>' THEN BREAK; - IF (Final^ IN CNameStart) AND (Element.Name = '') THEN BEGIN - ExtractName (Final, CWhitespace, F); - SetStringSF (Element.Name, Final, F); - Final := F; - F := StrScan (Final+1, '>'); - IF F = NIL THEN BEGIN - Element.Definition := STRING (Final); - Final := StrEnd (Final); - BREAK; - END - ELSE BEGIN - SetStringSF (Element.Definition, Final+1, F-1); - Final := F; - BREAK; - END; - END; - INC (Final); - UNTIL FALSE; - Element.Definition := DelChars (Element.Definition, CWhitespace); - ReplaceParameterEntities (Element.Definition); - IF Element.Definition = 'EMPTY' THEN Element.ElemType := etEmpty - ELSE IF Element.Definition = 'ANY' THEN Element.ElemType := etAny - ELSE IF Copy (Element.Definition, 1, 8) = '(#PCDATA' THEN Element.ElemType := etMixed - ELSE IF Copy (Element.Definition, 1, 1) = '(' THEN Element.ElemType := etChildren - ELSE Element.ElemType := etAny; - - Elem2 := Elements.Node (Element.Name); - IF Elem2 <> NIL THEN - Elements.Delete (Elements.IndexOf (Elem2)); - Elements.Add (Element); - Final := StrScanE (Final, '>'); - DER.ElementType := deElement; - DER.ElemDef := Element; - DER.Final := Final; - DtdElementFound (DER); -END; - - -PROCEDURE TXmlParser.AnalyzeAttListDecl (Start : PChar; VAR Final : PChar); - (* Parse ' character - XmlSpec 3.3: - AttlistDecl ::= '' - AttDef ::= S Name S AttType S DefaultDecl - AttType ::= StringType | TokenizedType | EnumeratedType - StringType ::= 'CDATA' - TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' - EnumeratedType ::= NotationType | Enumeration - NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' - Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' - DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) - AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" - Examples: - *) -TYPE - TPhase = (phElementName, phName, phType, phNotationContent, phDefault); -VAR - Phase : TPhase; - F : PChar; - ElementName : STRING; - ElemDef : TElemDef; - AttrDef : TAttrDef; - AttrDef2 : TAttrDef; - Strg : STRING; - DER : TDtdElementRec; -BEGIN - Final := Start + 9; // The character after ' : BREAK; - ELSE CASE Phase OF - phElementName : BEGIN - ExtractName (Final, CWhitespace + CQuoteChar + ['#'], F); - SetStringSF (ElementName, Final, F); - Final := F; - ElemDef := Elements.Node (ElementName); - IF ElemDef = NIL THEN BEGIN - ElemDef := TElemDef.Create; - ElemDef.Name := ElementName; - ElemDef.Definition := 'ANY'; - ElemDef.ElemType := etAny; - Elements.Add (ElemDef); - END; - Phase := phName; - END; - phName : BEGIN - AttrDef := TAttrDef.Create; - ExtractName (Final, CWhitespace + CQuoteChar + ['#'], F); - SetStringSF (AttrDef.Name, Final, F); - Final := F; - AttrDef2 := TAttrDef (ElemDef.Node (AttrDef.Name)); - IF AttrDef2 <> NIL THEN - ElemDef.Delete (ElemDef.IndexOf (AttrDef2)); - ElemDef.Add (AttrDef); - Phase := phType; - END; - phType : BEGIN - IF Final^ = '(' THEN BEGIN - F := StrScan (Final+1, ')'); - IF F <> NIL - THEN SetStringSF (AttrDef.TypeDef, Final+1, F-1) - ELSE AttrDef.TypeDef := STRING (Final+1); - AttrDef.TypeDef := DelChars (AttrDef.TypeDef, CWhitespace); - AttrDef.AttrType := atEnumeration; - ReplaceParameterEntities (AttrDef.TypeDef); - ReplaceCharacterEntities (AttrDef.TypeDef); - Phase := phDefault; - END - ELSE IF StrLComp (Final, 'NOTATION', 8) = 0 THEN BEGIN - INC (Final, 8); - AttrDef.AttrType := atNotation; - Phase := phNotationContent; - END - ELSE BEGIN - ExtractName (Final, CWhitespace+CQuoteChar+['#'], F); - SetStringSF (AttrDef.TypeDef, Final, F); - IF AttrDef.TypeDef = 'CDATA' THEN AttrDef.AttrType := atCData - ELSE IF AttrDef.TypeDef = 'ID' THEN AttrDef.AttrType := atId - ELSE IF AttrDef.TypeDef = 'IDREF' THEN AttrDef.AttrType := atIdRef - ELSE IF AttrDef.TypeDef = 'IDREFS' THEN AttrDef.AttrType := atIdRefs - ELSE IF AttrDef.TypeDef = 'ENTITY' THEN AttrDef.AttrType := atEntity - ELSE IF AttrDef.TypeDef = 'ENTITIES' THEN AttrDef.AttrType := atEntities - ELSE IF AttrDef.TypeDef = 'NMTOKEN' THEN AttrDef.AttrType := atNmToken - ELSE IF AttrDef.TypeDef = 'NMTOKENS' THEN AttrDef.AttrType := atNmTokens; - Phase := phDefault; - END - END; - phNotationContent : BEGIN - F := StrScan (Final, ')'); - IF F <> NIL THEN - SetStringSF (AttrDef.Notations, Final+1, F-1) - ELSE BEGIN - AttrDef.Notations := STRING (Final+1); - Final := StrEnd (Final); - END; - ReplaceParameterEntities (AttrDef.Notations); - AttrDef.Notations := DelChars (AttrDef.Notations, CWhitespace); - Phase := phDefault; - END; - phDefault : BEGIN - IF Final^ = '#' THEN BEGIN - ExtractName (Final, CWhiteSpace + CQuoteChar, F); - SetStringSF (Strg, Final, F); - Final := F; - ReplaceParameterEntities (Strg); - IF Strg = '#REQUIRED' THEN BEGIN AttrDef.DefaultType := adRequired; Phase := phName; END - ELSE IF Strg = '#IMPLIED' THEN BEGIN AttrDef.DefaultType := adImplied; Phase := phName; END - ELSE IF Strg = '#FIXED' THEN AttrDef.DefaultType := adFixed; - END - ELSE IF (Final^ IN CQuoteChar) THEN BEGIN - ExtractQuote (Final, AttrDef.Value, Final); - ReplaceParameterEntities (AttrDef.Value); - ReplaceCharacterEntities (AttrDef.Value); - Phase := phName; - END; - IF Phase = phName THEN BEGIN - AttrDef := NIL; - END; - END; - - END; - END; - INC (Final); - UNTIL FALSE; - - Final := StrScan (Final, '>'); - - DER.ElementType := deAttList; - DER.ElemDef := ElemDef; - DER.Final := Final; - DtdElementFound (DER); -END; - - -PROCEDURE TXmlParser.AnalyzeEntityDecl (Start : PChar; VAR Final : PChar); - (* Parse ' character - XmlSpec 4.2: - EntityDecl ::= '' | - '' - EntityDef ::= EntityValue | (ExternalID NDataDecl?) - PEDef ::= EntityValue | ExternalID - NDataDecl ::= S 'NDATA' S Name - EntityValue ::= '"' ([^%&"] | PEReference | EntityRef | CharRef)* '"' | - "'" ([^%&'] | PEReference | EntityRef | CharRef)* "'" - PEReference ::= '%' Name ';' - - Examples - - - - "> - - - Dies ist ein Test-Absatz

"> - *) -TYPE - TPhase = (phName, phContent, phNData, phNotationName, phFinalGT); -VAR - Phase : TPhase; - IsParamEntity : BOOLEAN; - F : PChar; - ExternalID : TExternalID; - EntityDef : TEntityDef; - EntityDef2 : TEntityDef; - DER : TDtdElementRec; -BEGIN - Final := Start + 8; // First char after ' : BREAK; - ELSE CASE Phase OF - phName : IF Final^ IN CNameStart THEN BEGIN - ExtractName (Final, CWhitespace + CQuoteChar, F); - SetStringSF (EntityDef.Name, Final, F); - Final := F; - Phase := phContent; - END; - phContent : IF Final^ IN CQuoteChar THEN BEGIN - ExtractQuote (Final, EntityDef.Value, Final); - Phase := phFinalGT; - END - ELSE IF (StrLComp (Final, 'SYSTEM', 6) = 0) OR - (StrLComp (Final, 'PUBLIC', 6) = 0) THEN BEGIN - ExternalID := TExternalID.Create (Final); - EntityDef.SystemId := ExternalID.SystemId; - EntityDef.PublicId := ExternalID.PublicId; - Final := ExternalID.Final; - Phase := phNData; - ExternalID.Free; - END; - phNData : IF StrLComp (Final, 'NDATA', 5) = 0 THEN BEGIN - INC (Final, 4); - Phase := phNotationName; - END; - phNotationName : IF Final^ IN CNameStart THEN BEGIN - ExtractName (Final, CWhitespace + ['>'], F); - SetStringSF (EntityDef.NotationName, Final, F); - Final := F; - Phase := phFinalGT; - END; - phFinalGT : ; // -!- There is an error in the document if this branch is called - END; - END; - INC (Final); - UNTIL FALSE; - IF IsParamEntity THEN BEGIN - EntityDef2 := TEntityDef (ParEntities.Node (EntityDef.Name)); - IF EntityDef2 <> NIL THEN - ParEntities.Delete (ParEntities.IndexOf (EntityDef2)); - ParEntities.Add (EntityDef); - ReplaceCharacterEntities (EntityDef.Value); - END - ELSE BEGIN - EntityDef2 := TEntityDef (Entities.Node (EntityDef.Name)); - IF EntityDef2 <> NIL THEN - Entities.Delete (Entities.IndexOf (EntityDef2)); - Entities.Add (EntityDef); - ReplaceParameterEntities (EntityDef.Value); // Create replacement texts (see XmlSpec 4.5) - ReplaceCharacterEntities (EntityDef.Value); - END; - Final := StrScanE (Final, '>'); - - DER.ElementType := deEntity; - DER.EntityDef := EntityDef; - DER.Final := Final; - DtdElementFound (DER); -END; - - -PROCEDURE TXmlParser.AnalyzeNotationDecl (Start : PChar; VAR Final : PChar); - // Parse ' character - // XmlSpec 4.7: NotationDecl ::= '' -TYPE - TPhase = (phName, phExtId, phEnd); -VAR - ExternalID : TExternalID; - Phase : TPhase; - F : PChar; - NotationDef : TNotationDef; - DER : TDtdElementRec; -BEGIN - Final := Start + 10; // Character after ', - #0 : BREAK; - ELSE CASE Phase OF - phName : BEGIN - ExtractName (Final, CWhitespace + ['>'], F); - SetStringSF (NotationDef.Name, Final, F); - Final := F; - Phase := phExtId; - END; - phExtId : BEGIN - ExternalID := TExternalID.Create (Final); - NotationDef.Value := ExternalID.SystemId; - NotationDef.PublicId := ExternalID.PublicId; - Final := ExternalId.Final; - ExternalId.Free; - Phase := phEnd; - END; - phEnd : ; // -!- There is an error in the document if this branch is called - END; - END; - INC (Final); - UNTIL FALSE; - Notations.Add (NotationDef); - Final := StrScanE (Final, '>'); - - DER.ElementType := deNotation; - DER.NotationDef := NotationDef; - DER.Final := Final; - DtdElementFound (DER); -END; - - -PROCEDURE TXmlParser.PushPE (VAR Start : PChar); - (* If there is a parameter entity reference found in the data stream, - the current position will be pushed to the entity stack. - Start: IN Pointer to the '%' character starting the PE reference - OUT Pointer to first character of PE replacement text *) -VAR - P : PChar; - EntityDef : TEntityDef; -BEGIN - P := StrScan (Start, ';'); - IF P <> NIL THEN BEGIN - EntityDef := TEntityDef (ParEntities.Node (StrSFPas (Start+1, P-1))); - IF EntityDef <> NIL THEN BEGIN - EntityStack.Push (P+1); - Start := PChar (EntityDef.Value); - END - ELSE - Start := P+1; - END; -END; - - -PROCEDURE TXmlParser.ReplaceCharacterEntities (VAR Str : STRING); - // Replaces all Character Entity References in the String -VAR - Start : INTEGER; - PAmp : PChar; - PSemi : PChar; - PosAmp : INTEGER; - Len : INTEGER; // Length of Entity Reference -BEGIN - IF Str = '' THEN EXIT; - Start := 1; - REPEAT - PAmp := StrPos (PChar (Str) + Start-1, '&#'); - IF PAmp = NIL THEN BREAK; - PSemi := StrScan (PAmp+2, ';'); - IF PSemi = NIL THEN BREAK; - PosAmp := PAmp - PChar (Str) + 1; - Len := PSemi-PAmp+1; - IF CompareText (Str [PosAmp+2], 'x') = 0 // !!! Can't use "CHR" for Unicode characters > 255 - THEN Str [PosAmp] := CHR (StrToIntDef ('$'+Copy (Str, PosAmp+3, Len-4), 0)) - ELSE Str [PosAmp] := CHR (StrToIntDef (Copy (Str, PosAmp+2, Len-3), 32)); - Delete (Str, PosAmp+1, Len-1); - Start := PosAmp + 1; - UNTIL FALSE; -END; - - -PROCEDURE TXmlParser.ReplaceParameterEntities (VAR Str : STRING); - // Recursively replaces all Parameter Entity References in the String - PROCEDURE ReplaceEntities (VAR Str : STRING); - VAR - Start : INTEGER; - PAmp : PChar; - PSemi : PChar; - PosAmp : INTEGER; - Len : INTEGER; - Entity : TEntityDef; - Repl : STRING; // Replacement - BEGIN - IF Str = '' THEN EXIT; - Start := 1; - REPEAT - PAmp := StrPos (PChar (Str)+Start-1, '%'); - IF PAmp = NIL THEN BREAK; - PSemi := StrScan (PAmp+2, ';'); - IF PSemi = NIL THEN BREAK; - PosAmp := PAmp - PChar (Str) + 1; - Len := PSemi-PAmp+1; - Entity := TEntityDef (ParEntities.Node (Copy (Str, PosAmp+1, Len-2))); - IF Entity <> NIL THEN BEGIN - Repl := Entity.Value; - ReplaceEntities (Repl); // Recursion - END - ELSE - Repl := Copy (Str, PosAmp, Len); - Delete (Str, PosAmp, Len); - Insert (Repl, Str, PosAmp); - Start := PosAmp + Length (Repl); - UNTIL FALSE; - END; -BEGIN - ReplaceEntities (Str); -END; - - -PROCEDURE TXmlParser.ReplaceGeneralEntities (VAR Str : STRING); - // Recursively replaces General Entity References in the String - PROCEDURE ReplaceEntities (VAR Str : STRING); - VAR - Start : INTEGER; - PAmp : PChar; - PSemi : PChar; - PosAmp : INTEGER; - Len : INTEGER; - EntityDef : TEntityDef; - EntName : STRING; - Repl : STRING; // Replacement - ExternalEntity : TXmlParser; - BEGIN - IF Str = '' THEN EXIT; - Start := 1; - REPEAT - PAmp := StrPos (PChar (Str)+Start-1, '&'); - IF PAmp = NIL THEN BREAK; - PSemi := StrScan (PAmp+2, ';'); - IF PSemi = NIL THEN BREAK; - PosAmp := PAmp - PChar (Str) + 1; - Len := PSemi-PAmp+1; - EntName := Copy (Str, PosAmp+1, Len-2); - IF EntName = 'lt' THEN Repl := '<' - ELSE IF EntName = 'gt' THEN Repl := '>' - ELSE IF EntName = 'amp' THEN Repl := '&' - ELSE IF EntName = 'apos' THEN Repl := '''' - ELSE IF EntName = 'quot' THEN Repl := '"' - ELSE BEGIN - EntityDef := TEntityDef (Entities.Node (EntName)); - IF EntityDef <> NIL THEN BEGIN - IF EntityDef.Value <> '' THEN // Internal Entity - Repl := EntityDef.Value - ELSE BEGIN // External Entity - ExternalEntity := LoadExternalEntity (EntityDef.SystemId, EntityDef.PublicId, EntityDef.NotationName); - Repl := StrPas (ExternalEntity.DocBuffer); // !!! What if it contains a Text Declaration? - ExternalEntity.Free; - END; - ReplaceEntities (Repl); // Recursion - END - ELSE - Repl := Copy (Str, PosAmp, Len); - END; - Delete (Str, PosAmp, Len); - Insert (Repl, Str, PosAmp); - Start := PosAmp + Length (Repl); - UNTIL FALSE; - END; -BEGIN - ReplaceEntities (Str); -END; - - -FUNCTION TXmlParser.LoadExternalEntity (SystemId, PublicId, Notation : STRING) : TXmlParser; - // This will be called whenever there is a Parsed External Entity or - // the DTD External Subset to be parsed. - // It has to create a TXmlParser instance and load the desired Entity. - // This instance of LoadExternalEntity assumes that "SystemId" is a valid - // file name (relative to the Document source) and loads this file using - // the LoadFromFile method. -VAR - Filename : STRING; -BEGIN - // --- Convert System ID to complete filename - Filename := StringReplace (SystemId, '/', '\', [rfReplaceAll]); - IF Copy (FSource, 1, 1) <> '<' THEN - IF (Copy (Filename, 1, 2) = '\\') OR (Copy (Filename, 2, 1) = ':') THEN - // Already has an absolute Path - ELSE BEGIN - Filename := ExtractFilePath (FSource) + Filename; - END; - - // --- Load the File - Result := TXmlParser.Create; - Result.LoadFromFile (Filename); -END; - - -FUNCTION TXmlParser.TranslateEncoding (CONST Source : STRING) : STRING; - // The member variable "CurEncoding" always holds the name of the current - // encoding, e.g. 'UTF-8' or 'ISO-8859-1'. - // This virtual method "TranslateEncoding" is responsible for translating - // the content passed in the "Source" parameter to the Encoding which - // is expected by the application. - // This instance of "TranlateEncoding" assumes that the Application expects - // Windows ANSI (Win1252) strings. It is able to transform UTF-8 or ISO-8859-1 - // encodings. - // If you want your application to understand or create other encodings, you - // override this function. -BEGIN - IF CurEncoding = 'UTF-8' - THEN Result := Utf8ToAnsi (Source) - ELSE Result := Source; -END; - - -PROCEDURE TXmlParser.DtdElementFound (DtdElementRec : TDtdElementRec); - // This method is called for every element which is found in the DTD - // declaration. The variant record TDtdElementRec is passed which - // holds informations about the element. - // You can override this function to handle DTD declarations. - // Note that when you parse the same Document instance a second time, - // the DTD will not get parsed again. -BEGIN -END; - - -FUNCTION TXmlParser.GetDocBuffer: PChar; - // Returns FBuffer or a pointer to a NUL char if Buffer is empty -BEGIN - IF FBuffer = NIL - THEN Result := #0 - ELSE Result := FBuffer; -END; - - -(*$IFNDEF HAS_CONTNRS_UNIT -=============================================================================================== -TObjectList -=============================================================================================== -*) - -DESTRUCTOR TObjectList.Destroy; -BEGIN - Clear; - SetCapacity(0); - INHERITED Destroy; -END; - - -PROCEDURE TObjectList.Delete (Index : INTEGER); -BEGIN - IF (Index < 0) OR (Index >= Count) THEN EXIT; - TObject (Items [Index]).Free; - INHERITED Delete (Index); -END; - - -PROCEDURE TObjectList.Clear; -BEGIN - WHILE Count > 0 DO - Delete (Count-1); -END; - -(*$ENDIF *) - -(* -=============================================================================================== -TNvpNode --------- -Node base class for the TNvpList -=============================================================================================== -*) - -CONSTRUCTOR TNvpNode.Create (TheName, TheValue : STRING); -BEGIN - INHERITED Create; - Name := TheName; - Value := TheValue; -END; - - -(* -=============================================================================================== -TNvpList --------- -A generic List of Name-Value Pairs, based on the TObjectList introduced in Delphi 5 -=============================================================================================== -*) - -PROCEDURE TNvpList.Add (Node : TNvpNode); -VAR - I : INTEGER; -BEGIN - FOR I := Count-1 DOWNTO 0 DO - IF Node.Name > TNvpNode (Items [I]).Name THEN BEGIN - Insert (I+1, Node); - EXIT; - END; - Insert (0, Node); -END; - - - -FUNCTION TNvpList.Node (Name : STRING) : TNvpNode; - // Binary search for Node -VAR - L, H : INTEGER; // Low, High Limit - T, C : INTEGER; // Test Index, Comparison result - Last : INTEGER; // Last Test Index -BEGIN - IF Count=0 THEN BEGIN - Result := NIL; - EXIT; - END; - - L := 0; - H := Count; - Last := -1; - REPEAT - T := (L+H) DIV 2; - IF T=Last THEN BREAK; - Result := TNvpNode (Items [T]); - C := CompareStr (Result.Name, Name); - IF C = 0 THEN EXIT - ELSE IF C < 0 THEN L := T - ELSE H := T; - Last := T; - UNTIL FALSE; - Result := NIL; -END; - - -FUNCTION TNvpList.Node (Index : INTEGER) : TNvpNode; -BEGIN - IF (Index < 0) OR (Index >= Count) - THEN Result := NIL - ELSE Result := TNvpNode (Items [Index]); -END; - - -FUNCTION TNvpList.Value (Name : STRING) : STRING; -VAR - Nvp : TNvpNode; -BEGIN - Nvp := TNvpNode (Node (Name)); - IF Nvp <> NIL - THEN Result := Nvp.Value - ELSE Result := ''; -END; - - -FUNCTION TNvpList.Value (Index : INTEGER) : STRING; -BEGIN - IF (Index < 0) OR (Index >= Count) - THEN Result := '' - ELSE Result := TNvpNode (Items [Index]).Value; -END; - - -FUNCTION TNvpList.Name (Index : INTEGER) : STRING; -BEGIN - IF (Index < 0) OR (Index >= Count) - THEN Result := '' - ELSE Result := TNvpNode (Items [Index]).Name; -END; - - -(* -=============================================================================================== -TAttrList -List of Attributes. The "Analyze" method extracts the Attributes from the given Buffer. -Is used for extraction of Attributes in Start-Tags, Empty-Element Tags and the "pseudo" -attributes in XML Prologs, Text Declarations and PIs. -=============================================================================================== -*) - -PROCEDURE TAttrList.Analyze (Start : PChar; VAR Final : PChar); - // Analyze the Buffer for Attribute=Name pairs. - // Terminates when there is a character which is not IN CNameStart - // (e.g. '?>' or '>' or '/>') -TYPE - TPhase = (phName, phEq, phValue); -VAR - Phase : TPhase; - F : PChar; - Name : STRING; - Value : STRING; - Attr : TAttr; -BEGIN - Clear; - Phase := phName; - Final := Start; - REPEAT - IF (Final^ = #0) OR (Final^ = '>') THEN BREAK; - IF NOT (Final^ IN CWhitespace) THEN - CASE Phase OF - phName : BEGIN - IF NOT (Final^ IN CNameStart) THEN EXIT; - ExtractName (Final, CWhitespace + ['=', '/'], F); - SetStringSF (Name, Final, F); - Final := F; - Phase := phEq; - END; - phEq : BEGIN - IF Final^ = '=' THEN - Phase := phValue - END; - phValue : BEGIN - IF Final^ IN CQuoteChar THEN BEGIN - ExtractQuote (Final, Value, F); - Attr := TAttr.Create; - Attr.Name := Name; - Attr.Value := Value; - Attr.ValueType := vtNormal; - Add (Attr); - Final := F; - Phase := phName; - END; - END; - END; - INC (Final); - UNTIL FALSE; -END; - - -(* -=============================================================================================== -TElemList -List of TElemDef nodes. -=============================================================================================== -*) - -FUNCTION TElemList.Node (Name : STRING) : TElemDef; - // Binary search for the Node with the given Name -VAR - L, H : INTEGER; // Low, High Limit - T, C : INTEGER; // Test Index, Comparison result - Last : INTEGER; // Last Test Index -BEGIN - IF Count=0 THEN BEGIN - Result := NIL; - EXIT; - END; - - L := 0; - H := Count; - Last := -1; - REPEAT - T := (L+H) DIV 2; - IF T=Last THEN BREAK; - Result := TElemDef (Items [T]); - C := CompareStr (Result.Name, Name); - IF C = 0 THEN EXIT - ELSE IF C < 0 THEN L := T - ELSE H := T; - Last := T; - UNTIL FALSE; - Result := NIL; -END; - - -PROCEDURE TElemList.Add (Node : TElemDef); -VAR - I : INTEGER; -BEGIN - FOR I := Count-1 DOWNTO 0 DO - IF Node.Name > TElemDef (Items [I]).Name THEN BEGIN - Insert (I+1, Node); - EXIT; - END; - Insert (0, Node); -END; - - -(* -=============================================================================================== -TScannerXmlParser -A TXmlParser descendant for the TCustomXmlScanner component -=============================================================================================== -*) - -TYPE - TScannerXmlParser = CLASS (TXmlParser) - Scanner : TCustomXmlScanner; - CONSTRUCTOR Create (TheScanner : TCustomXmlScanner); - FUNCTION LoadExternalEntity (SystemId, PublicId, - Notation : STRING) : TXmlParser; OVERRIDE; - FUNCTION TranslateEncoding (CONST Source : STRING) : STRING; OVERRIDE; - PROCEDURE DtdElementFound (DtdElementRec : TDtdElementRec); OVERRIDE; - END; - -CONSTRUCTOR TScannerXmlParser.Create (TheScanner : TCustomXmlScanner); -BEGIN - INHERITED Create; - Scanner := TheScanner; -END; - - -FUNCTION TScannerXmlParser.LoadExternalEntity (SystemId, PublicId, Notation : STRING) : TXmlParser; -BEGIN - IF Assigned (Scanner.FOnLoadExternal) - THEN Scanner.FOnLoadExternal (Scanner, SystemId, PublicId, Notation, Result) - ELSE Result := INHERITED LoadExternalEntity (SystemId, PublicId, Notation); -END; - - -FUNCTION TScannerXmlParser.TranslateEncoding (CONST Source : STRING) : STRING; -BEGIN - IF Assigned (Scanner.FOnTranslateEncoding) - THEN Result := Scanner.FOnTranslateEncoding (Scanner, CurEncoding, Source) - ELSE Result := INHERITED TranslateEncoding (Source); -END; - - -PROCEDURE TScannerXmlParser.DtdElementFound (DtdElementRec : TDtdElementRec); -BEGIN - WITH DtdElementRec DO - CASE ElementType OF - deElement : Scanner.WhenElement (ElemDef); - deAttList : Scanner.WhenAttList (ElemDef); - deEntity : Scanner.WhenEntity (EntityDef); - deNotation : Scanner.WhenNotation (NotationDef); - dePI : Scanner.WhenPI (STRING (Target), STRING (Content), AttrList); - deComment : Scanner.WhenComment (StrSFPas (Start, Final)); - deError : Scanner.WhenDtdError (Pos); - END; -END; - - -(* -=============================================================================================== -TCustomXmlScanner -=============================================================================================== -*) - -CONSTRUCTOR TCustomXmlScanner.Create (AOwner: TComponent); -BEGIN - INHERITED; - FXmlParser := TScannerXmlParser.Create (Self); -END; - - -DESTRUCTOR TCustomXmlScanner.Destroy; -BEGIN - FXmlParser.Free; - INHERITED; -END; - - -PROCEDURE TCustomXmlScanner.LoadFromFile (Filename : TFilename); - // Load XML Document from file -BEGIN - FXmlParser.LoadFromFile (Filename); -END; - - -PROCEDURE TCustomXmlScanner.LoadFromBuffer (Buffer : PChar); - // Load XML Document from buffer -BEGIN - FXmlParser.LoadFromBuffer (Buffer); -END; - - -PROCEDURE TCustomXmlScanner.SetBuffer (Buffer : PChar); - // Refer to Buffer -BEGIN - FXmlParser.SetBuffer (Buffer); -END; - - -FUNCTION TCustomXmlScanner.GetFilename : TFilename; -BEGIN - Result := FXmlParser.Source; -END; - - -FUNCTION TCustomXmlScanner.GetNormalize : BOOLEAN; -BEGIN - Result := FXmlParser.Normalize; -END; - - -PROCEDURE TCustomXmlScanner.SetNormalize (Value : BOOLEAN); -BEGIN - FXmlParser.Normalize := Value; -END; - - -PROCEDURE TCustomXmlScanner.WhenXmlProlog(XmlVersion, Encoding: STRING; Standalone : BOOLEAN); - // Is called when the parser has parsed the declaration of the prolog -BEGIN - IF Assigned (FOnXmlProlog) THEN FOnXmlProlog (Self, XmlVersion, Encoding, Standalone); -END; - - -PROCEDURE TCustomXmlScanner.WhenComment (Comment : STRING); - // Is called when the parser has parsed a -BEGIN - IF Assigned (FOnComment) THEN FOnComment (Self, Comment); -END; - - -PROCEDURE TCustomXmlScanner.WhenPI (Target, Content: STRING; Attributes : TAttrList); - // Is called when the parser has parsed a -BEGIN - IF Assigned (FOnPI) THEN FOnPI (Self, Target, Content, Attributes); -END; - - -PROCEDURE TCustomXmlScanner.WhenDtdRead (RootElementName : STRING); - // Is called when the parser has completely parsed the DTD -BEGIN - IF Assigned (FOnDtdRead) THEN FOnDtdRead (Self, RootElementName); -END; - - -PROCEDURE TCustomXmlScanner.WhenStartTag (TagName : STRING; Attributes : TAttrList); - // Is called when the parser has parsed a start tag like

-BEGIN - IF Assigned (FOnStartTag) THEN FOnStartTag (Self, TagName, Attributes); -END; - - -PROCEDURE TCustomXmlScanner.WhenEmptyTag (TagName : STRING; Attributes : TAttrList); - // Is called when the parser has parsed an Empty Element Tag like
-BEGIN - IF Assigned (FOnEmptyTag) THEN FOnEmptyTag (Self, TagName, Attributes); -END; - - -PROCEDURE TCustomXmlScanner.WhenEndTag (TagName : STRING); - // Is called when the parser has parsed an End Tag like

-BEGIN - IF Assigned (FOnEndTag) THEN FOnEndTag (Self, TagName); -END; - - -PROCEDURE TCustomXmlScanner.WhenContent (Content : STRING); - // Is called when the parser has parsed an element's text content -BEGIN - IF Assigned (FOnContent) THEN FOnContent (Self, Content); -END; - - -PROCEDURE TCustomXmlScanner.WhenCData (Content : STRING); - // Is called when the parser has parsed a CDATA section -BEGIN - IF Assigned (FOnCData) THEN FOnCData (Self, Content); -END; - - -PROCEDURE TCustomXmlScanner.WhenElement (ElemDef : TElemDef); - // Is called when the parser has parsed an definition - // inside the DTD -BEGIN - IF Assigned (FOnElement) THEN FOnElement (Self, ElemDef); -END; - - -PROCEDURE TCustomXmlScanner.WhenAttList (ElemDef : TElemDef); - // Is called when the parser has parsed an definition - // inside the DTD -BEGIN - IF Assigned (FOnAttList) THEN FOnAttList (Self, ElemDef); -END; - - -PROCEDURE TCustomXmlScanner.WhenEntity (EntityDef : TEntityDef); - // Is called when the parser has parsed an definition - // inside the DTD -BEGIN - IF Assigned (FOnEntity) THEN FOnEntity (Self, EntityDef); -END; - - -PROCEDURE TCustomXmlScanner.WhenNotation (NotationDef : TNotationDef); - // Is called when the parser has parsed a definition - // inside the DTD -BEGIN - IF Assigned (FOnNotation) THEN FOnNotation (Self, NotationDef); -END; - - -PROCEDURE TCustomXmlScanner.WhenDtdError (ErrorPos : PChar); - // Is called when the parser has found an Error in the DTD -BEGIN - IF Assigned (FOnDtdError) THEN FOnDtdError (Self, ErrorPos); -END; - - -PROCEDURE TCustomXmlScanner.Execute; - // Perform scanning - // Scanning is done synchronously, i.e. you can expect events to be triggered - // in the order of the XML data stream. Execute will finish when the whole XML - // document has been scanned or when the StopParser property has been set to TRUE. -BEGIN - FStopParser := FALSE; - FXmlParser.StartScan; - WHILE FXmlParser.Scan AND (NOT FStopParser) DO - CASE FXmlParser.CurPartType OF - ptNone : ; - ptXmlProlog : WhenXmlProlog (FXmlParser.XmlVersion, FXmlParser.Encoding, FXmlParser.Standalone); - ptComment : WhenComment (StrSFPas (FXmlParser.CurStart, FXmlParser.CurFinal)); - ptPI : WhenPI (FXmlParser.CurName, FXmlParser.CurContent, FXmlParser.CurAttr); - ptDtdc : WhenDtdRead (FXmlParser.RootName); - ptStartTag : WhenStartTag (FXmlParser.CurName, FXmlParser.CurAttr); - ptEmptyTag : WhenEmptyTag (FXmlParser.CurName, FXmlParser.CurAttr); - ptEndTag : WhenEndTag (FXmlParser.CurName); - ptContent : WhenContent (FXmlParser.CurContent); - ptCData : WhenCData (FXmlParser.CurContent); - END; -END; - - -END. +(** +=============================================================================================== +Name : LibXmlParser +=============================================================================================== +Project : All Projects +=============================================================================================== +Subject : Progressive XML Parser for all types of XML Files +=============================================================================================== +Author : Stefan Heymann + Eschenweg 3 + 72076 Tübingen + GERMANY + +E-Mail: stefan@destructor.de +URL: www.destructor.de +=============================================================================================== +Source, Legals ("Licence") +-------------------------- +The official site to get this parser is http://www.destructor.de/ + +Usage and Distribution of this Source Code is ruled by the +"Destructor.de Source code Licence" (DSL) which comes with this file or +can be downloaded at http://www.destructor.de/ + +IN SHORT: Usage and distribution of this source code is free. + You use it completely on your own risk. + +Postcardware +------------ +If you like this code, please send a postcard of your city to my above address. +=============================================================================================== +!!! All parts of this code which are not finished or not conforming exactly to + the XmlSpec are marked with three exclamation marks + +-!- Parts where the parser may be able to detect errors in the document's syntax are + marked with the dash-exlamation mark-dash sequence. +=============================================================================================== +Terminology: +------------ +- Start: Start of a buffer part +- Final: End (last character) of a buffer part +- DTD: Document Type Definition +- DTDc: Document Type Declaration +- XMLSpec: The current W3C XML Recommendation (version 1.0 as of 1998-02-10), Chapter No. +- Cur*: Fields concerning the "Current" part passed back by the "Scan" method +=============================================================================================== +Scanning the XML document +------------------------- +- Create TXmlParser Instance MyXml := TXmlParser.Create; +- Load XML Document MyXml.LoadFromFile (Filename); +- Start Scanning MyXml.StartScan; +- Scan Loop WHILE MyXml.Scan DO +- Test for Part Type CASE MyXml.CurPartType OF +- Handle Parts ... : ;;; +- Handle Parts ... : ;;; +- Handle Parts ... : ;;; + END; +- Destroy MyXml.Free; +=============================================================================================== +Loading the XML document +------------------------ +You can load the XML document from a file with the "LoadFromFile" method. +It is beyond the scope of this parser to perform HTTP or FTP accesses. If you want your +application to handle such requests (URLs), you can load the XML via HTTP or FTP or whatever +protocol and hand over the data buffer using the "LoadFromBuffer" or "SetBuffer" method. +"LoadFromBuffer" loads the internal buffer of TXmlParser with the given null-terminated +string, thereby creating a copy of that buffer. +"SetBuffer" just takes the pointer to another buffer, which means that the given +buffer pointer must be valid while the document is accessed via TXmlParser. +=============================================================================================== +Encodings: +---------- +This XML parser kind of "understands" the following encodings: +- UTF-8 +- ISO-8859-1 +- Windows-1252 + +Any flavor of multi-byte characters (and this includes UTF-16) is not supported. Sorry. + +Every string which has to be passed to the application passes the virtual method +"TranslateEncoding" which translates the string from the current encoding (stored in +"CurEncoding") into the encoding the application wishes to receive. +The "TranslateEncoding" method that is built into TXmlParser assumes that the application +wants to receive Windows ANSI (Windows-1252, about the same as ISO-8859-1) and is able +to convert UTF-8 and ISO-8859-1 encodings. +For other source and target encodings, you will have to override "TranslateEncoding". +=============================================================================================== +Buffer Handling +--------------- +- The document must be loaded completely into a piece of RAM +- All character positions are referenced by PChar pointers +- The TXmlParser instance can either "own" the buffer itself (then, FBufferSize is > 0) + or reference the buffer of another instance or object (then, FBuffersize is 0 and + FBuffer is not NIL) +- The Property DocBuffer passes back a pointer to the first byte of the document. If there + is no document stored (FBuffer is NIL), the DocBuffer returns a pointer to a NULL character. +=============================================================================================== +Whitespace Handling +------------------- +The TXmlParser property "PackSpaces" determines how Whitespace is returned in Text Content: +While PackSpaces is true, all leading and trailing whitespace characters are trimmed of, all +Whitespace is converted to Space #x20 characters and contiguous Whitespace characters are +compressed to one. +If the "Scan" method reports a ptContent part, the application can get the original text +with all whitespace characters by extracting the characters from "CurStart" to "CurFinal". +If the application detects an xml:space attribute, it can set "PackSpaces" accordingly or +use CurStart/CurFinal. +Please note that TXmlParser does _not_ normalize Line Breaks to single LineFeed characters +as the XmlSpec requires (XmlSpec 2.11). +The xml:space attribute is not handled by TXmlParser. This is on behalf of the application. +=============================================================================================== +Non-XML-Conforming +------------------ +TXmlParser does not conform 100 % exactly to the XmlSpec: +- UTF-16 is not supported (XmlSpec 2.2) + (Workaround: Convert UTF-16 to UTF-8 and hand the buffer over to TXmlParser) +- As the parser only works with single byte strings, all Unicode characters > 255 + can currently not be handled correctly. +- Line breaks are not normalized to single Linefeed #x0A characters (XmlSpec 2.11) + (Workaround: The Application can access the text contents on its own [CurStart, CurFinal], + thereby applying every normalization it wishes to) +- The attribute value normalization does not work exactly as defined in the + Second Edition of the XML 1.0 specification. +- See also the code parts marked with three consecutive exclamation marks. These are + parts which are not finished in the current code release. + +This list may be incomplete, so it may grow if I get to know any other points. +As work on the parser proceeds, this list may also shrink. +=============================================================================================== +Things Todo +----------- +- Introduce a new event/callback which is called when there is an unresolvable + entity or character reference +- Support Unicode +- Use Streams instead of reading the whole XML into memory +=============================================================================================== +Change History, Version numbers +------------------------------- +The Date is given in ISO Year-Month-Day (YYYY-MM-DD) order. +Versions are counted from 1.0.0 beginning with the version from 2000-03-16. +Unreleased versions don't get a version number. + +Date Author Version Changes +----------------------------------------------------------------------------------------------- +2000-03-16 HeySt 1.0.0 Start +2000-03-28 HeySt 1.0.1 Initial Publishing of TXmlParser on the destructor.de Web Site +2000-03-30 HeySt 1.0.2 TXmlParser.AnalyzeCData: Call "TranslateEncoding" for CurContent +2000-03-31 HeySt 1.0.3 Deleted the StrPosE function (was not needed anyway) +2000-04-04 HeySt 1.0.4 TDtdElementRec modified: Start/Final for all Elements; + Should be backwards compatible. + AnalyzeDtdc: Set CurPartType to ptDtdc +2000-04-23 HeySt 1.0.5 New class TObjectList. Eliminated reference to the Delphi 5 + "Contnrs" unit so LibXmlParser is Delphi 4 compatible. +2000-07-03 HeySt 1.0.6 TNvpNode: Added Constructor +2000-07-11 HeySt 1.0.7 Removed "Windows" from USES clause + Added three-exclamation-mark comments for Utf8ToAnsi/AnsiToUtf8 + Added three-exclamation-mark comments for CHR function calls +2000-07-23 HeySt 1.0.8 TXmlParser.Clear: CurAttr.Clear; EntityStack.Clear; + (This was not a bug; just defensive programming) +2000-07-29 HeySt 1.0.9 TNvpList: Added methods: Node(Index), Value(Index), Name(Index); +2000-10-07 HeySt Introduced Conditional Defines + Uses Contnrs unit and its TObjectList class again for + Delphi 5 and newer versions +2001-01-30 HeySt Introduced Version Numbering + Made LoadFromFile and LoadFromBuffer BOOLEAN functions + Introduced FileMode parameter for LoadFromFile + BugFix: TAttrList.Analyze: Must add CWhitespace to ExtractName call + Comments worked over +2001-02-28 HeySt 1.0.10 Completely worked over and tested the UTF-8 functions + Fixed a bug in TXmlParser.Scan which caused it to start over when it + was called after the end of scanning, resulting in an endless loop + TEntityStack is now a TObjectList instead of TList +2001-07-03 HeySt 1.0.11 Updated Compiler Version IFDEFs for Kylix +2001-07-11 HeySt 1.0.12 New TCustomXmlScanner component (taken over from LibXmlComps.pas) +2001-07-14 HeySt 1.0.13 Bugfix TCustomXmlScanner.FOnTranslateEncoding +2001-10-22 HeySt Don't clear CurName anymore when the parser finds a CDATA section. +2001-12-03 HeySt 1.0.14 TObjectList.Clear: Make call to INHERITED method (fixes a memory leak) +2001-12-05 HeySt 1.0.15 TObjectList.Clear: removed call to INHERITED method + TObjectList.Destroy: Inserted SetCapacity call. + Reduces need for frequent re-allocation of pointer buffer + Dedicated to my father, Theodor Heymann +2002-06-26 HeySt 1.0.16 TXmlParser.Scan: Fixed a bug with PIs whose name is beginning + with 'xml'. Thanks to Uwe Kamm for submitting this bug. + The CurEncoding property is now always in uppercase letters (the XML + spec wants it to be treated case independently so when it's uppercase + comparisons are faster) +2002-03-04 HeySt 1.0.17 Included an IFDEF for Delphi 7 (VER150) and Kylix + There is a new symbol HAS_CONTNRS_UNIT which is used now to + distinguish between IDEs which come with the Contnrs unit and + those that don't. +*) + +UNIT libxmlparser; + +{$I jedi-sdl.inc} + +INTERFACE + +USES + SysUtils, Classes, + (*$IFDEF HAS_CONTNRS_UNIT *) // The Contnrs Unit was introduced in Delphi 5 + Contnrs, + (*$ENDIF*) + Math; + +CONST + CVersion = '1.0.17'; // This variable will be updated for every release + // (I hope, I won't forget to do it everytime ...) + +TYPE + TPartType = // --- Document Part Types + (ptNone, // Nothing + ptXmlProlog, // XML Prolog XmlSpec 2.8 / 4.3.1 + ptComment, // Comment XmlSpec 2.5 + ptPI, // Processing Instruction XmlSpec 2.6 + ptDtdc, // Document Type Declaration XmlSpec 2.8 + ptStartTag, // Start Tag XmlSpec 3.1 + ptEmptyTag, // Empty-Element Tag XmlSpec 3.1 + ptEndTag, // End Tag XmlSpec 3.1 + ptContent, // Text Content between Tags + ptCData); // CDATA Section XmlSpec 2.7 + + TDtdElemType = // --- DTD Elements + (deElement, // !ELEMENT declaration + deAttList, // !ATTLIST declaration + deEntity, // !ENTITY declaration + deNotation, // !NOTATION declaration + dePI, // PI in DTD + deComment, // Comment in DTD + deError); // Error found in the DTD + +TYPE + TAttrList = CLASS; + TEntityStack = CLASS; + TNvpList = CLASS; + TElemDef = CLASS; + TElemList = CLASS; + TEntityDef = CLASS; + TNotationDef = CLASS; + + TDtdElementRec = RECORD // --- This Record is returned by the DTD parser callback function + Start, Final : PChar; // Start/End of the Element's Declaration + CASE ElementType : TDtdElemType OF // Type of the Element + deElement, // + deAttList : (ElemDef : TElemDef); // + deEntity : (EntityDef : TEntityDef); // + deNotation : (NotationDef : TNotationDef); // + dePI : (Target : PChar; // + Content : PChar; + AttrList : TAttrList); + deError : (Pos : PChar); // Error + // deComment : ((No additional fields here)); // + END; + + TXmlParser = CLASS // --- Internal Properties and Methods + PROTECTED + FBuffer : PChar; // NIL if there is no buffer available + FBufferSize : INTEGER; // 0 if the buffer is not owned by the Document instance + FSource : STRING; // Name of Source of document. Filename for Documents loaded with LoadFromFile + + FXmlVersion : STRING; // XML version from Document header. Default is '1.0' + FEncoding : STRING; // Encoding from Document header. Default is 'UTF-8' + FStandalone : BOOLEAN; // Standalone declaration from Document header. Default is 'yes' + FRootName : STRING; // Name of the Root Element (= DTD name) + FDtdcFinal : PChar; // Pointer to the '>' character terminating the DTD declaration + + FNormalize : BOOLEAN; // If true: Pack Whitespace and don't return empty contents + EntityStack : TEntityStack; // Entity Stack for Parameter and General Entities + FCurEncoding : STRING; // Current Encoding during parsing (always uppercase) + + PROCEDURE AnalyzeProlog; // Analyze XML Prolog or Text Declaration + PROCEDURE AnalyzeComment (Start : PChar; VAR Final : PChar); // Analyze Comments + PROCEDURE AnalyzePI (Start : PChar; VAR Final : PChar); // Analyze Processing Instructions (PI) + PROCEDURE AnalyzeDtdc; // Analyze Document Type Declaration + PROCEDURE AnalyzeDtdElements (Start : PChar; VAR Final : PChar); // Analyze DTD declarations + PROCEDURE AnalyzeTag; // Analyze Start/End/Empty-Element Tags + PROCEDURE AnalyzeCData; // Analyze CDATA Sections + PROCEDURE AnalyzeText (VAR IsDone : BOOLEAN); // Analyze Text Content between Tags + PROCEDURE AnalyzeElementDecl (Start : PChar; VAR Final : PChar); + PROCEDURE AnalyzeAttListDecl (Start : PChar; VAR Final : PChar); + PROCEDURE AnalyzeEntityDecl (Start : PChar; VAR Final : PChar); + PROCEDURE AnalyzeNotationDecl (Start : PChar; VAR Final : PChar); + + PROCEDURE PushPE (VAR Start : PChar); + PROCEDURE ReplaceCharacterEntities (VAR Str : STRING); + PROCEDURE ReplaceParameterEntities (VAR Str : STRING); + PROCEDURE ReplaceGeneralEntities (VAR Str : STRING); + + FUNCTION GetDocBuffer : PChar; // Returns FBuffer or a pointer to a NUL char if Buffer is empty + + PUBLIC // --- Document Properties + PROPERTY XmlVersion : STRING READ FXmlVersion; // XML version from the Document Prolog + PROPERTY Encoding : STRING READ FEncoding; // Document Encoding from Prolog + PROPERTY Standalone : BOOLEAN READ FStandalone; // Standalone Declaration from Prolog + PROPERTY RootName : STRING READ FRootName; // Name of the Root Element + PROPERTY Normalize : BOOLEAN READ FNormalize WRITE FNormalize; // True if Content is to be normalized + PROPERTY Source : STRING READ FSource; // Name of Document Source (Filename) + PROPERTY DocBuffer : PChar READ GetDocBuffer; // Returns document buffer + PUBLIC // --- DTD Objects + Elements : TElemList; // Elements: List of TElemDef (contains Attribute Definitions) + Entities : TNvpList; // General Entities: List of TEntityDef + ParEntities : TNvpList; // Parameter Entities: List of TEntityDef + Notations : TNvpList; // Notations: List of TNotationDef + PUBLIC + CONSTRUCTOR Create; + DESTRUCTOR Destroy; OVERRIDE; + + // --- Document Handling + FUNCTION LoadFromFile (Filename : STRING; + FileMode : INTEGER = fmOpenRead OR fmShareDenyNone) : BOOLEAN; + // Loads Document from given file + FUNCTION LoadFromBuffer (Buffer : PChar) : BOOLEAN; // Loads Document from another buffer + PROCEDURE SetBuffer (Buffer : PChar); // References another buffer + PROCEDURE Clear; // Clear Document + + PUBLIC + // --- Scanning through the document + CurPartType : TPartType; // Current Type + CurName : STRING; // Current Name + CurContent : STRING; // Current Normalized Content + CurStart : PChar; // Current First character + CurFinal : PChar; // Current Last character + CurAttr : TAttrList; // Current Attribute List + PROPERTY CurEncoding : STRING READ FCurEncoding; // Current Encoding + PROCEDURE StartScan; + FUNCTION Scan : BOOLEAN; + + // --- Events / Callbacks + FUNCTION LoadExternalEntity (SystemId, PublicId, + Notation : STRING) : TXmlParser; VIRTUAL; + FUNCTION TranslateEncoding (CONST Source : STRING) : STRING; VIRTUAL; + PROCEDURE DtdElementFound (DtdElementRec : TDtdElementRec); VIRTUAL; + END; + + TValueType = // --- Attribute Value Type + (vtNormal, // Normal specified Attribute + vtImplied, // #IMPLIED attribute value + vtFixed, // #FIXED attribute value + vtDefault); // Attribute value from default value in !ATTLIST declaration + + TAttrDefault = // --- Attribute Default Type + (adDefault, // Normal default value + adRequired, // #REQUIRED attribute + adImplied, // #IMPLIED attribute + adFixed); // #FIXED attribute + + TAttrType = // --- Type of attribute + (atUnknown, // Unknown type + atCData, // Character data only + atID, // ID + atIdRef, // ID Reference + atIdRefs, // Several ID References, separated by Whitespace + atEntity, // Name of an unparsed Entity + atEntities, // Several unparsed Entity names, separated by Whitespace + atNmToken, // Name Token + atNmTokens, // Several Name Tokens, separated by Whitespace + atNotation, // A selection of Notation names (Unparsed Entity) + atEnumeration); // Enumeration + + TElemType = // --- Element content type + (etEmpty, // Element is always empty + etAny, // Element can have any mixture of PCDATA and any elements + etChildren, // Element must contain only elements + etMixed); // Mixed PCDATA and elements + + (*$IFDEF HAS_CONTNRS_UNIT *) + TObjectList = Contnrs.TObjectList; // Re-Export this identifier + (*$ELSE *) + TObjectList = CLASS (TList) + DESTRUCTOR Destroy; OVERRIDE; + PROCEDURE Delete (Index : INTEGER); + PROCEDURE Clear; OVERRIDE; + END; + (*$ENDIF *) + + TNvpNode = CLASS // Name-Value Pair Node + Name : STRING; + Value : STRING; + CONSTRUCTOR Create (TheName : STRING = ''; TheValue : STRING = ''); + END; + + TNvpList = CLASS (TObjectList) // Name-Value Pair List + PROCEDURE Add (Node : TNvpNode); + FUNCTION Node (Name : STRING) : TNvpNode; OVERLOAD; + FUNCTION Node (Index : INTEGER) : TNvpNode; OVERLOAD; + FUNCTION Value (Name : STRING) : STRING; OVERLOAD; + FUNCTION Value (Index : INTEGER) : STRING; OVERLOAD; + FUNCTION Name (Index : INTEGER) : STRING; + END; + + TAttr = CLASS (TNvpNode) // Attribute of a Start-Tag or Empty-Element-Tag + ValueType : TValueType; + AttrType : TAttrType; + END; + + TAttrList = CLASS (TNvpList) // List of Attributes + PROCEDURE Analyze (Start : PChar; VAR Final : PChar); + END; + + TEntityStack = CLASS (TObjectList) // Stack where current position is stored before parsing entities + PROTECTED + Owner : TXmlParser; + PUBLIC + CONSTRUCTOR Create (TheOwner : TXmlParser); + PROCEDURE Push (LastPos : PChar); OVERLOAD; + PROCEDURE Push (Instance : TObject; LastPos : PChar); OVERLOAD; + FUNCTION Pop : PChar; // Returns next char or NIL if EOF is reached. Frees Instance. + END; + + TAttrDef = CLASS (TNvpNode) // Represents a '; + + // --- Name Constants for the above enumeration types + CPartType_Name : ARRAY [TPartType] OF STRING = + ('', 'XML Prolog', 'Comment', 'PI', + 'DTD Declaration', 'Start Tag', 'Empty Tag', 'End Tag', + 'Text', 'CDATA'); + CValueType_Name : ARRAY [TValueType] OF STRING = ('Normal', 'Implied', 'Fixed', 'Default'); + CAttrDefault_Name : ARRAY [TAttrDefault] OF STRING = ('Default', 'Required', 'Implied', 'Fixed'); + CElemType_Name : ARRAY [TElemType] OF STRING = ('Empty', 'Any', 'Childs only', 'Mixed'); + CAttrType_Name : ARRAY [TAttrType] OF STRING = ('Unknown', 'CDATA', + 'ID', 'IDREF', 'IDREFS', + 'ENTITY', 'ENTITIES', + 'NMTOKEN', 'NMTOKENS', + 'Notation', 'Enumeration'); + +FUNCTION ConvertWs (Source: STRING; PackWs: BOOLEAN) : STRING; // Convert WS to spaces #x20 +PROCEDURE SetStringSF (VAR S : STRING; BufferStart, BufferFinal : PChar); // SetString by Start/Final of buffer +FUNCTION StrSFPas (Start, Finish : PChar) : STRING; // Convert buffer part to Pascal string +FUNCTION TrimWs (Source : STRING) : STRING; // Trim Whitespace + +FUNCTION AnsiToUtf8 (Source : ANSISTRING) : STRING; // Convert Win-1252 to UTF-8 +FUNCTION Utf8ToAnsi (Source : STRING; UnknownChar : CHAR = '¿') : ANSISTRING; // Convert UTF-8 to Win-1252 + + +(* +=============================================================================================== +TCustomXmlScanner event based component wrapper for TXmlParser +=============================================================================================== +*) + +TYPE + TCustomXmlScanner = CLASS; + TXmlPrologEvent = PROCEDURE (Sender : TObject; XmlVersion, Encoding: STRING; Standalone : BOOLEAN) OF OBJECT; + TCommentEvent = PROCEDURE (Sender : TObject; Comment : STRING) OF OBJECT; + TPIEvent = PROCEDURE (Sender : TObject; Target, Content: STRING; Attributes : TAttrList) OF OBJECT; + TDtdEvent = PROCEDURE (Sender : TObject; RootElementName : STRING) OF OBJECT; + TStartTagEvent = PROCEDURE (Sender : TObject; TagName : STRING; Attributes : TAttrList) OF OBJECT; + TEndTagEvent = PROCEDURE (Sender : TObject; TagName : STRING) OF OBJECT; + TContentEvent = PROCEDURE (Sender : TObject; Content : STRING) OF OBJECT; + TElementEvent = PROCEDURE (Sender : TObject; ElemDef : TElemDef) OF OBJECT; + TEntityEvent = PROCEDURE (Sender : TObject; EntityDef : TEntityDef) OF OBJECT; + TNotationEvent = PROCEDURE (Sender : TObject; NotationDef : TNotationDef) OF OBJECT; + TErrorEvent = PROCEDURE (Sender : TObject; ErrorPos : PChar) OF OBJECT; + TExternalEvent = PROCEDURE (Sender : TObject; SystemId, PublicId, NotationId : STRING; + VAR Result : TXmlParser) OF OBJECT; + TEncodingEvent = FUNCTION (Sender : TObject; CurrentEncoding, Source : STRING) : STRING OF OBJECT; + + + TCustomXmlScanner = CLASS (TComponent) + PROTECTED + FXmlParser : TXmlParser; + FOnXmlProlog : TXmlPrologEvent; + FOnComment : TCommentEvent; + FOnPI : TPIEvent; + FOnDtdRead : TDtdEvent; + FOnStartTag : TStartTagEvent; + FOnEmptyTag : TStartTagEvent; + FOnEndTag : TEndTagEvent; + FOnContent : TContentEvent; + FOnCData : TContentEvent; + FOnElement : TElementEvent; + FOnAttList : TElementEvent; + FOnEntity : TEntityEvent; + FOnNotation : TNotationEvent; + FOnDtdError : TErrorEvent; + FOnLoadExternal : TExternalEvent; + FOnTranslateEncoding : TEncodingEvent; + FStopParser : BOOLEAN; + FUNCTION GetNormalize : BOOLEAN; + PROCEDURE SetNormalize (Value : BOOLEAN); + + PROCEDURE WhenXmlProlog(XmlVersion, Encoding: STRING; Standalone : BOOLEAN); VIRTUAL; + PROCEDURE WhenComment (Comment : STRING); VIRTUAL; + PROCEDURE WhenPI (Target, Content: STRING; Attributes : TAttrList); VIRTUAL; + PROCEDURE WhenDtdRead (RootElementName : STRING); VIRTUAL; + PROCEDURE WhenStartTag (TagName : STRING; Attributes : TAttrList); VIRTUAL; + PROCEDURE WhenEmptyTag (TagName : STRING; Attributes : TAttrList); VIRTUAL; + PROCEDURE WhenEndTag (TagName : STRING); VIRTUAL; + PROCEDURE WhenContent (Content : STRING); VIRTUAL; + PROCEDURE WhenCData (Content : STRING); VIRTUAL; + PROCEDURE WhenElement (ElemDef : TElemDef); VIRTUAL; + PROCEDURE WhenAttList (ElemDef : TElemDef); VIRTUAL; + PROCEDURE WhenEntity (EntityDef : TEntityDef); VIRTUAL; + PROCEDURE WhenNotation (NotationDef : TNotationDef); VIRTUAL; + PROCEDURE WhenDtdError (ErrorPos : PChar); VIRTUAL; + + PUBLIC + CONSTRUCTOR Create (AOwner: TComponent); OVERRIDE; + DESTRUCTOR Destroy; OVERRIDE; + + PROCEDURE LoadFromFile (Filename : TFilename); // Load XML Document from file + PROCEDURE LoadFromBuffer (Buffer : PChar); // Load XML Document from buffer + PROCEDURE SetBuffer (Buffer : PChar); // Refer to Buffer + FUNCTION GetFilename : TFilename; + + PROCEDURE Execute; // Perform scanning + + PROTECTED + PROPERTY XmlParser : TXmlParser READ FXmlParser; + PROPERTY StopParser : BOOLEAN READ FStopParser WRITE FStopParser; + PROPERTY Filename : TFilename READ GetFilename WRITE LoadFromFile; + PROPERTY Normalize : BOOLEAN READ GetNormalize WRITE SetNormalize; + PROPERTY OnXmlProlog : TXmlPrologEvent READ FOnXmlProlog WRITE FOnXmlProlog; + PROPERTY OnComment : TCommentEvent READ FOnComment WRITE FOnComment; + PROPERTY OnPI : TPIEvent READ FOnPI WRITE FOnPI; + PROPERTY OnDtdRead : TDtdEvent READ FOnDtdRead WRITE FOnDtdRead; + PROPERTY OnStartTag : TStartTagEvent READ FOnStartTag WRITE FOnStartTag; + PROPERTY OnEmptyTag : TStartTagEvent READ FOnEmptyTag WRITE FOnEmptyTag; + PROPERTY OnEndTag : TEndTagEvent READ FOnEndTag WRITE FOnEndTag; + PROPERTY OnContent : TContentEvent READ FOnContent WRITE FOnContent; + PROPERTY OnCData : TContentEvent READ FOnCData WRITE FOnCData; + PROPERTY OnElement : TElementEvent READ FOnElement WRITE FOnElement; + PROPERTY OnAttList : TElementEvent READ FOnAttList WRITE FOnAttList; + PROPERTY OnEntity : TEntityEvent READ FOnEntity WRITE FOnEntity; + PROPERTY OnNotation : TNotationEvent READ FOnNotation WRITE FOnNotation; + PROPERTY OnDtdError : TErrorEvent READ FOnDtdError WRITE FOnDtdError; + PROPERTY OnLoadExternal : TExternalEvent READ FOnLoadExternal WRITE FOnLoadExternal; + PROPERTY OnTranslateEncoding : TEncodingEvent READ FOnTranslateEncoding WRITE FOnTranslateEncoding; + END; + +(* +=============================================================================================== +IMPLEMENTATION +=============================================================================================== +*) + +IMPLEMENTATION + + +(* +=============================================================================================== +Unicode and UTF-8 stuff +=============================================================================================== +*) + +CONST + // --- Character Translation Table for Unicode <-> Win-1252 + WIN1252_UNICODE : ARRAY [$00..$FF] OF WORD = ( + $0000, $0001, $0002, $0003, $0004, $0005, $0006, $0007, $0008, $0009, + $000A, $000B, $000C, $000D, $000E, $000F, $0010, $0011, $0012, $0013, + $0014, $0015, $0016, $0017, $0018, $0019, $001A, $001B, $001C, $001D, + $001E, $001F, $0020, $0021, $0022, $0023, $0024, $0025, $0026, $0027, + $0028, $0029, $002A, $002B, $002C, $002D, $002E, $002F, $0030, $0031, + $0032, $0033, $0034, $0035, $0036, $0037, $0038, $0039, $003A, $003B, + $003C, $003D, $003E, $003F, $0040, $0041, $0042, $0043, $0044, $0045, + $0046, $0047, $0048, $0049, $004A, $004B, $004C, $004D, $004E, $004F, + $0050, $0051, $0052, $0053, $0054, $0055, $0056, $0057, $0058, $0059, + $005A, $005B, $005C, $005D, $005E, $005F, $0060, $0061, $0062, $0063, + $0064, $0065, $0066, $0067, $0068, $0069, $006A, $006B, $006C, $006D, + $006E, $006F, $0070, $0071, $0072, $0073, $0074, $0075, $0076, $0077, + $0078, $0079, $007A, $007B, $007C, $007D, $007E, $007F, + + $20AC, $0081, $201A, $0192, $201E, $2026, $2020, $2021, $02C6, $2030, + $0160, $2039, $0152, $008D, $017D, $008F, $0090, $2018, $2019, $201C, + $201D, $2022, $2013, $2014, $02DC, $2122, $0161, $203A, $0153, $009D, + $017E, $0178, $00A0, $00A1, $00A2, $00A3, $00A4, $00A5, $00A6, $00A7, + $00A8, $00A9, $00AA, $00AB, $00AC, $00AD, $00AE, $00AF, $00B0, $00B1, + $00B2, $00B3, $00B4, $00B5, $00B6, $00B7, $00B8, $00B9, $00BA, $00BB, + $00BC, $00BD, $00BE, $00BF, $00C0, $00C1, $00C2, $00C3, $00C4, $00C5, + $00C6, $00C7, $00C8, $00C9, $00CA, $00CB, $00CC, $00CD, $00CE, $00CF, + $00D0, $00D1, $00D2, $00D3, $00D4, $00D5, $00D6, $00D7, $00D8, $00D9, + $00DA, $00DB, $00DC, $00DD, $00DE, $00DF, $00E0, $00E1, $00E2, $00E3, + $00E4, $00E5, $00E6, $00E7, $00E8, $00E9, $00EA, $00EB, $00EC, $00ED, + $00EE, $00EF, $00F0, $00F1, $00F2, $00F3, $00F4, $00F5, $00F6, $00F7, + $00F8, $00F9, $00FA, $00FB, $00FC, $00FD, $00FE, $00FF); + +(* UTF-8 (somewhat simplified) + ----- + Character Range Byte sequence + --------------- -------------------------- (x=Bits from original character) + $0000..$007F 0xxxxxxx + $0080..$07FF 110xxxxx 10xxxxxx + $8000..$FFFF 1110xxxx 10xxxxxx 10xxxxxx + + Example + -------- + Transforming the Unicode character U+00E4 LATIN SMALL LETTER A WITH DIAERESIS ("ä"): + + ISO-8859-1, Decimal 228 + Win1252, Hex $E4 + ANSI Bin 1110 0100 + abcd efgh + + UTF-8 Binary 1100xxab 10cdefgh + Binary 11000011 10100100 + Hex $C3 $A4 + Decimal 195 164 + ANSI Ã ¤ *) + + +FUNCTION AnsiToUtf8 (Source : ANSISTRING) : STRING; + (* Converts the given Windows ANSI (Win1252) String to UTF-8. *) +VAR + I : INTEGER; // Loop counter + U : WORD; // Current Unicode value + Len : INTEGER; // Current real length of "Result" string +BEGIN + SetLength (Result, Length (Source) * 3); // Worst case + Len := 0; + FOR I := 1 TO Length (Source) DO BEGIN + U := WIN1252_UNICODE [ORD (Source [I])]; + CASE U OF + $0000..$007F : BEGIN + INC (Len); + Result [Len] := CHR (U); + END; + $0080..$07FF : BEGIN + INC (Len); + Result [Len] := CHR ($C0 OR (U SHR 6)); + INC (Len); + Result [Len] := CHR ($80 OR (U AND $3F)); + END; + $0800..$FFFF : BEGIN + INC (Len); + Result [Len] := CHR ($E0 OR (U SHR 12)); + INC (Len); + Result [Len] := CHR ($80 OR ((U SHR 6) AND $3F)); + INC (Len); + Result [Len] := CHR ($80 OR (U AND $3F)); + END; + END; + END; + SetLength (Result, Len); +END; + + +FUNCTION Utf8ToAnsi (Source : STRING; UnknownChar : CHAR = '¿') : ANSISTRING; + (* Converts the given UTF-8 String to Windows ANSI (Win-1252). + If a character can not be converted, the "UnknownChar" is inserted. *) +VAR + SourceLen : INTEGER; // Length of Source string + I, K : INTEGER; + A : BYTE; // Current ANSI character value + U : WORD; + Ch : CHAR; // Dest char + Len : INTEGER; // Current real length of "Result" string +BEGIN + SourceLen := Length (Source); + SetLength (Result, SourceLen); // Enough room to live + Len := 0; + I := 1; + WHILE I <= SourceLen DO BEGIN + A := ORD (Source [I]); + IF A < $80 THEN BEGIN // Range $0000..$007F + INC (Len); + Result [Len] := Source [I]; + INC (I); + END + ELSE BEGIN // Determine U, Inc I + IF (A AND $E0 = $C0) AND (I < SourceLen) THEN BEGIN // Range $0080..$07FF + U := (WORD (A AND $1F) SHL 6) OR (ORD (Source [I+1]) AND $3F); + INC (I, 2); + END + ELSE IF (A AND $F0 = $E0) AND (I < SourceLen-1) THEN BEGIN // Range $0800..$FFFF + U := (WORD (A AND $0F) SHL 12) OR + (WORD (ORD (Source [I+1]) AND $3F) SHL 6) OR + ( ORD (Source [I+2]) AND $3F); + INC (I, 3); + END + ELSE BEGIN // Unknown/unsupported + INC (I); + FOR K := 7 DOWNTO 0 DO + IF A AND (1 SHL K) = 0 THEN BEGIN + INC (I, (A SHR (K+1))-1); + BREAK; + END; + U := WIN1252_UNICODE [ORD (UnknownChar)]; + END; + Ch := UnknownChar; // Retrieve ANSI char + FOR A := $00 TO $FF DO + IF WIN1252_UNICODE [A] = U THEN BEGIN + Ch := CHR (A); + BREAK; + END; + INC (Len); + Result [Len] := Ch; + END; + END; + SetLength (Result, Len); +END; + + +(* +=============================================================================================== +"Special" Helper Functions + +Don't ask me why. But including these functions makes the parser *DRAMATICALLY* faster +on my K6-233 machine. You can test it yourself just by commenting them out. +They do exactly the same as the Assembler routines defined in SysUtils. +(This is where you can see how great the Delphi compiler really is. The compiled code is +faster than hand-coded assembler!) +=============================================================================================== +--> Just move this line below the StrScan function --> *) + + +FUNCTION StrPos (CONST Str, SearchStr : PChar) : PChar; + // Same functionality as SysUtils.StrPos +VAR + First : CHAR; + Len : INTEGER; +BEGIN + First := SearchStr^; + Len := StrLen (SearchStr); + Result := Str; + REPEAT + IF Result^ = First THEN + IF StrLComp (Result, SearchStr, Len) = 0 THEN BREAK; + IF Result^ = #0 THEN BEGIN + Result := NIL; + BREAK; + END; + INC (Result); + UNTIL FALSE; +END; + + +FUNCTION StrScan (CONST Start : PChar; CONST Ch : CHAR) : PChar; + // Same functionality as SysUtils.StrScan +BEGIN + Result := Start; + WHILE Result^ <> Ch DO BEGIN + IF Result^ = #0 THEN BEGIN + Result := NIL; + EXIT; + END; + INC (Result); + END; +END; + + +(* +=============================================================================================== +Helper Functions +=============================================================================================== +*) + +FUNCTION DelChars (Source : STRING; CharsToDelete : TCharset) : STRING; + // Delete all "CharsToDelete" from the string +VAR + I : INTEGER; +BEGIN + Result := Source; + FOR I := Length (Result) DOWNTO 1 DO + IF Result [I] IN CharsToDelete THEN + Delete (Result, I, 1); +END; + + +FUNCTION TrimWs (Source : STRING) : STRING; + // Trimms off Whitespace characters from both ends of the string +VAR + I : INTEGER; +BEGIN + // --- Trim Left + I := 1; + WHILE (I <= Length (Source)) AND (Source [I] IN CWhitespace) DO + INC (I); + Result := Copy (Source, I, MaxInt); + + // --- Trim Right + I := Length (Result); + WHILE (I > 1) AND (Result [I] IN CWhitespace) DO + DEC (I); + Delete (Result, I+1, Length (Result)-I); +END; + + +FUNCTION ConvertWs (Source: STRING; PackWs: BOOLEAN) : STRING; + // Converts all Whitespace characters to the Space #x20 character + // If "PackWs" is true, contiguous Whitespace characters are packed to one +VAR + I : INTEGER; +BEGIN + Result := Source; + FOR I := Length (Result) DOWNTO 1 DO + IF (Result [I] IN CWhitespace) THEN + IF PackWs AND (I > 1) AND (Result [I-1] IN CWhitespace) + THEN Delete (Result, I, 1) + ELSE Result [I] := #32; +END; + + +PROCEDURE SetStringSF (VAR S : STRING; BufferStart, BufferFinal : PChar); +BEGIN + SetString (S, BufferStart, BufferFinal-BufferStart+1); +END; + + +FUNCTION StrLPas (Start : PChar; Len : INTEGER) : STRING; +BEGIN + SetString (Result, Start, Len); +END; + + +FUNCTION StrSFPas (Start, Finish : PChar) : STRING; +BEGIN + SetString (Result, Start, Finish-Start+1); +END; + + +FUNCTION StrScanE (CONST Source : PChar; CONST CharToScanFor : CHAR) : PChar; + // If "CharToScanFor" is not found, StrScanE returns the last char of the + // buffer instead of NIL +BEGIN + Result := StrScan (Source, CharToScanFor); + IF Result = NIL THEN + Result := StrEnd (Source)-1; +END; + + +PROCEDURE ExtractName (Start : PChar; Terminators : TCharset; VAR Final : PChar); + (* Extracts the complete Name beginning at "Start". + It is assumed that the name is contained in Markup, so the '>' character is + always a Termination. + Start: IN Pointer to first char of name. Is always considered to be valid + Terminators: IN Characters which terminate the name + Final: OUT Pointer to last char of name *) +BEGIN + Final := Start+1; + Include (Terminators, #0); + Include (Terminators, '>'); + WHILE NOT (Final^ IN Terminators) DO + INC (Final); + DEC (Final); +END; + + +PROCEDURE ExtractQuote (Start : PChar; VAR Content : STRING; VAR Final : PChar); + (* Extract a string which is contained in single or double Quotes. + Start: IN Pointer to opening quote + Content: OUT The quoted string + Final: OUT Pointer to closing quote *) +BEGIN + Final := StrScan (Start+1, Start^); + IF Final = NIL THEN BEGIN + Final := StrEnd (Start+1)-1; + SetString (Content, Start+1, Final-Start); + END + ELSE + SetString (Content, Start+1, Final-1-Start); +END; + + +(* +=============================================================================================== +TEntityStackNode +This Node is pushed to the "Entity Stack" whenever the parser parses entity replacement text. +The "Instance" field holds the Instance pointer of an External Entity buffer. When it is +popped, the Instance is freed. +The "Encoding" field holds the name of the Encoding. External Parsed Entities may have +another encoding as the document entity (XmlSpec 4.3.3). So when there is an " 0 THEN BEGIN + ESN := TEntityStackNode (Items [Count-1]); + Result := ESN.LastPos; + IF ESN.Instance <> NIL THEN + ESN.Instance.Free; + IF ESN.Encoding <> '' THEN + Owner.FCurEncoding := ESN.Encoding; // Restore current Encoding + Delete (Count-1); + END + ELSE + Result := NIL; +END; + + +(* +=============================================================================================== +TExternalID +----------- +XmlSpec 4.2.2: ExternalID ::= 'SYSTEM' S SystemLiteral | + 'PUBLIC' S PubidLiteral S SystemLiteral +XmlSpec 4.7: PublicID ::= 'PUBLIC' S PubidLiteral +SystemLiteral and PubidLiteral are quoted +=============================================================================================== +*) + +TYPE + TExternalID = CLASS + PublicId : STRING; + SystemId : STRING; + Final : PChar; + CONSTRUCTOR Create (Start : PChar); + END; + +CONSTRUCTOR TExternalID.Create (Start : PChar); +BEGIN + INHERITED Create; + Final := Start; + IF StrLComp (Start, 'SYSTEM', 6) = 0 THEN BEGIN + WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); + IF NOT (Final^ IN CQuoteChar) THEN EXIT; + ExtractQuote (Final, SystemID, Final); + END + ELSE IF StrLComp (Start, 'PUBLIC', 6) = 0 THEN BEGIN + WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); + IF NOT (Final^ IN CQuoteChar) THEN EXIT; + ExtractQuote (Final, PublicID, Final); + INC (Final); + WHILE NOT (Final^ IN (CQuoteChar + [#0, '>', '['])) DO INC (Final); + IF NOT (Final^ IN CQuoteChar) THEN EXIT; + ExtractQuote (Final, SystemID, Final); + END; +END; + + +(* +=============================================================================================== +TXmlParser +=============================================================================================== +*) + +CONSTRUCTOR TXmlParser.Create; +BEGIN + INHERITED Create; + FBuffer := NIL; + FBufferSize := 0; + Elements := TElemList.Create; + Entities := TNvpList.Create; + ParEntities := TNvpList.Create; + Notations := TNvpList.Create; + CurAttr := TAttrList.Create; + EntityStack := TEntityStack.Create (Self); + Clear; +END; + + +DESTRUCTOR TXmlParser.Destroy; +BEGIN + Clear; + Elements.Free; + Entities.Free; + ParEntities.Free; + Notations.Free; + CurAttr.Free; + EntityStack.Free; + INHERITED Destroy; +END; + + +PROCEDURE TXmlParser.Clear; + // Free Buffer and clear all object attributes +BEGIN + IF (FBufferSize > 0) AND (FBuffer <> NIL) THEN + FreeMem (FBuffer); + FBuffer := NIL; + FBufferSize := 0; + FSource := ''; + FXmlVersion := ''; + FEncoding := ''; + FStandalone := FALSE; + FRootName := ''; + FDtdcFinal := NIL; + FNormalize := TRUE; + Elements.Clear; + Entities.Clear; + ParEntities.Clear; + Notations.Clear; + CurAttr.Clear; + EntityStack.Clear; +END; + + +FUNCTION TXmlParser.LoadFromFile (Filename : STRING; FileMode : INTEGER = fmOpenRead OR fmShareDenyNone) : BOOLEAN; + // Loads Document from given file + // Returns TRUE if successful +VAR + f : FILE; + ReadIn : INTEGER; + OldFileMode : INTEGER; +BEGIN + Result := FALSE; + Clear; + + // --- Open File + OldFileMode := SYSTEM.FileMode; + TRY + SYSTEM.FileMode := FileMode; + TRY + AssignFile (f, Filename); + Reset (f, 1); + EXCEPT + EXIT; + END; + + TRY + // --- Allocate Memory + TRY + FBufferSize := Filesize (f) + 1; + GetMem (FBuffer, FBufferSize); + EXCEPT + Clear; + EXIT; + END; + + // --- Read File + TRY + BlockRead (f, FBuffer^, FBufferSize, ReadIn); + (FBuffer+ReadIn)^ := #0; // NULL termination + EXCEPT + Clear; + EXIT; + END; + FINALLY + CloseFile (f); + END; + + FSource := Filename; + Result := TRUE; + + FINALLY + SYSTEM.FileMode := OldFileMode; + END; +END; + + +FUNCTION TXmlParser.LoadFromBuffer (Buffer : PChar) : BOOLEAN; + // Loads Document from another buffer + // Returns TRUE if successful + // The "Source" property becomes '' if successful +BEGIN + Result := FALSE; + Clear; + FBufferSize := StrLen (Buffer) + 1; + TRY + GetMem (FBuffer, FBufferSize); + EXCEPT + Clear; + EXIT; + END; + StrCopy (FBuffer, Buffer); + FSource := ''; + Result := TRUE; +END; + + +PROCEDURE TXmlParser.SetBuffer (Buffer : PChar); // References another buffer +BEGIN + Clear; + FBuffer := Buffer; + FBufferSize := 0; + FSource := ''; +END; + + +//----------------------------------------------------------------------------------------------- +// Scanning through the document +//----------------------------------------------------------------------------------------------- + +PROCEDURE TXmlParser.StartScan; +BEGIN + CurPartType := ptNone; + CurName := ''; + CurContent := ''; + CurStart := NIL; + CurFinal := NIL; + CurAttr.Clear; + EntityStack.Clear; +END; + + +FUNCTION TXmlParser.Scan : BOOLEAN; + // Scans the next Part + // Returns TRUE if a part could be found, FALSE if there is no part any more + // + // "IsDone" can be set to FALSE by AnalyzeText in order to go to the next part + // if there is no Content due to normalization +VAR + IsDone : BOOLEAN; +BEGIN + REPEAT + IsDone := TRUE; + + // --- Start of next Part + IF CurStart = NIL + THEN CurStart := DocBuffer + ELSE CurStart := CurFinal+1; + CurFinal := CurStart; + + // --- End of Document of Pop off a new part from the Entity stack? + IF CurStart^ = #0 THEN + CurStart := EntityStack.Pop; + + // --- No Document or End Of Document: Terminate Scan + IF (CurStart = NIL) OR (CurStart^ = #0) THEN BEGIN + CurStart := StrEnd (DocBuffer); + CurFinal := CurStart-1; + EntityStack.Clear; + Result := FALSE; + EXIT; + END; + + IF (StrLComp (CurStart, ''); + IF CurFinal <> NIL + THEN INC (CurFinal) + ELSE CurFinal := StrEnd (CurStart)-1; + FCurEncoding := AnsiUpperCase (CurAttr.Value ('encoding')); + IF FCurEncoding = '' THEN + FCurEncoding := 'UTF-8'; // Default XML Encoding is UTF-8 + CurPartType := ptXmlProlog; + CurName := ''; + CurContent := ''; +END; + + +PROCEDURE TXmlParser.AnalyzeComment (Start : PChar; VAR Final : PChar); + // Analyze Comments +BEGIN + Final := StrPos (Start+4, '-->'); + IF Final = NIL + THEN Final := StrEnd (Start)-1 + ELSE INC (Final, 2); + CurPartType := ptComment; +END; + + +PROCEDURE TXmlParser.AnalyzePI (Start : PChar; VAR Final : PChar); + // Analyze Processing Instructions (PI) + // This is also called for Character +VAR + F : PChar; +BEGIN + CurPartType := ptPI; + Final := StrPos (Start+2, '?>'); + IF Final = NIL + THEN Final := StrEnd (Start)-1 + ELSE INC (Final); + ExtractName (Start+2, CWhitespace + ['?', '>'], F); + SetStringSF (CurName, Start+2, F); + SetStringSF (CurContent, F+1, Final-2); + CurAttr.Analyze (F+1, F); +END; + + +PROCEDURE TXmlParser.AnalyzeDtdc; + (* Analyze Document Type Declaration + doctypedecl ::= '' + markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment + PEReference ::= '%' Name ';' + + elementdecl ::= '' + AttlistDecl ::= '' + EntityDecl ::= '' | + '' + NotationDecl ::= '' + PI ::= '' Char* )))? '?>' + Comment ::= '' *) +TYPE + TPhase = (phName, phDtd, phInternal, phFinishing); +VAR + Phase : TPhase; + F : PChar; + ExternalID : TExternalID; + ExternalDTD : TXmlParser; + DER : TDtdElementRec; +BEGIN + DER.Start := CurStart; + EntityStack.Clear; // Clear stack for Parameter Entities + CurPartType := ptDtdc; + + // --- Don't read DTDc twice + IF FDtdcFinal <> NIL THEN BEGIN + CurFinal := FDtdcFinal; + EXIT; + END; + + // --- Scan DTDc + CurFinal := CurStart + 9; // First char after '' : BREAK; + ELSE IF NOT (CurFinal^ IN CWhitespace) THEN BEGIN + CASE Phase OF + phName : IF (CurFinal^ IN CNameStart) THEN BEGIN + ExtractName (CurFinal, CWhitespace + ['[', '>'], F); + SetStringSF (FRootName, CurFinal, F); + CurFinal := F; + Phase := phDtd; + END; + phDtd : IF (StrLComp (CurFinal, 'SYSTEM', 6) = 0) OR + (StrLComp (CurFinal, 'PUBLIC', 6) = 0) THEN BEGIN + ExternalID := TExternalID.Create (CurFinal); + ExternalDTD := LoadExternalEntity (ExternalId.SystemId, ExternalID.PublicId, ''); + F := StrPos (ExternalDtd.DocBuffer, ' NIL THEN + AnalyzeDtdElements (F, F); + ExternalDTD.Free; + CurFinal := ExternalID.Final; + ExternalID.Free; + END; + ELSE BEGIN + DER.ElementType := deError; + DER.Pos := CurFinal; + DER.Final := CurFinal; + DtdElementFound (DER); + END; + END; + + END; + END; + INC (CurFinal); + UNTIL FALSE; + + CurPartType := ptDtdc; + CurName := ''; + CurContent := ''; + + // It is an error in the document if "EntityStack" is not empty now + IF EntityStack.Count > 0 THEN BEGIN + DER.ElementType := deError; + DER.Final := CurFinal; + DER.Pos := CurFinal; + DtdElementFound (DER); + END; + + EntityStack.Clear; // Clear stack for General Entities + FDtdcFinal := CurFinal; +END; + + +PROCEDURE TXmlParser.AnalyzeDtdElements (Start : PChar; VAR Final : PChar); + // Analyze the "Elements" of a DTD contained in the external or + // internal DTD subset. +VAR + DER : TDtdElementRec; +BEGIN + Final := Start; + REPEAT + CASE Final^ OF + '%' : BEGIN + PushPE (Final); + CONTINUE; + END; + #0 : IF EntityStack.Count = 0 THEN + BREAK + ELSE BEGIN + CurFinal := EntityStack.Pop; + CONTINUE; + END; + ']', + '>' : BREAK; + '<' : IF StrLComp (Final, ''); + + // --- Set Default Attribute values for nonexistent attributes + IF (CurPartType = ptStartTag) OR (CurPartType = ptEmptyTag) THEN BEGIN + ElemDef := Elements.Node (CurName); + IF ElemDef <> NIL THEN BEGIN + FOR I := 0 TO ElemDef.Count-1 DO BEGIN + AttrDef := TAttrDef (ElemDef [I]); + Attr := TAttr (CurAttr.Node (AttrDef.Name)); + IF (Attr = NIL) AND (AttrDef.Value <> '') THEN BEGIN + Attr := TAttr.Create (AttrDef.Name, AttrDef.Value); + Attr.ValueType := vtDefault; + CurAttr.Add (Attr); + END; + IF Attr <> NIL THEN BEGIN + CASE AttrDef.DefaultType OF + adDefault : ; + adRequired : ; // -!- It is an error in the document if "Attr.Value" is an empty string + adImplied : Attr.ValueType := vtImplied; + adFixed : BEGIN + Attr.ValueType := vtFixed; + Attr.Value := AttrDef.Value; + END; + END; + Attr.AttrType := AttrDef.AttrType; + END; + END; + END; + + // --- Normalize Attribute Values. XmlSpec: + // - a character reference is processed by appending the referenced character to the attribute value + // - an entity reference is processed by recursively processing the replacement text of the entity + // - a whitespace character (#x20, #xD, #xA, #x9) is processed by appending #x20 to the normalized value, + // except that only a single #x20 is appended for a "#xD#xA" sequence that is part of an external + // parsed entity or the literal entity value of an internal parsed entity + // - other characters are processed by appending them to the normalized value + // If the declared value is not CDATA, then the XML processor must further process the + // normalized attribute value by discarding any leading and trailing space (#x20) characters, + // and by replacing sequences of space (#x20) characters by a single space (#x20) character. + // All attributes for which no declaration has been read should be treated by a + // non-validating parser as if declared CDATA. + // !!! The XML 1.0 SE specification is somewhat different here + // This code does not conform exactly to this specification + FOR I := 0 TO CurAttr.Count-1 DO + WITH TAttr (CurAttr [I]) DO BEGIN + ReplaceGeneralEntities (Value); + ReplaceCharacterEntities (Value); + IF (AttrType <> atCData) AND (AttrType <> atUnknown) + THEN Value := TranslateEncoding (TrimWs (ConvertWs (Value, TRUE))) + ELSE Value := TranslateEncoding (ConvertWs (Value, FALSE)); + END; + END; +END; + + +PROCEDURE TXmlParser.AnalyzeCData; + // Analyze CDATA Sections +BEGIN + CurPartType := ptCData; + CurFinal := StrPos (CurStart, CDEnd); + IF CurFinal = NIL THEN BEGIN + CurFinal := StrEnd (CurStart)-1; + CurContent := TranslateEncoding (StrPas (CurStart+Length (CDStart))); + END + ELSE BEGIN + SetStringSF (CurContent, CurStart+Length (CDStart), CurFinal-1); + INC (CurFinal, Length (CDEnd)-1); + CurContent := TranslateEncoding (CurContent); + END; +END; + + +PROCEDURE TXmlParser.AnalyzeText (VAR IsDone : BOOLEAN); + (* Analyzes Text Content between Tags. CurFinal will point to the last content character. + Content ends at a '<' character or at the end of the document. + Entity References and Character Entity references are resolved. + If PackSpaces is TRUE, contiguous Whitespace Characters will be compressed to + one Space #x20 character, Whitespace at the beginning and end of content will + be trimmed off and content which is or becomes empty is not returned to + the application (in this case, "IsDone" is set to FALSE which causes the + Scan method to proceed directly to the next part. *) + + PROCEDURE ProcessEntity; + (* Is called if there is an ampsersand '&' character found in the document. + IN "CurFinal" points to the ampersand + OUT "CurFinal" points to the first character after the semi-colon ';' *) + VAR + P : PChar; + Name : STRING; + EntityDef : TEntityDef; + ExternalEntity : TXmlParser; + BEGIN + P := StrScan (CurFinal , ';'); + IF P <> NIL THEN BEGIN + SetStringSF (Name, CurFinal+1, P-1); + + // Is it a Character Entity? + IF (CurFinal+1)^ = '#' THEN BEGIN + IF UpCase ((CurFinal+2)^) = 'X' // !!! Can't use "CHR" for Unicode characters > 255: + THEN CurContent := CurContent + CHR (StrToIntDef ('$'+Copy (Name, 3, MaxInt), 32)) + ELSE CurContent := CurContent + CHR (StrToIntDef (Copy (Name, 2, MaxInt), 32)); + CurFinal := P+1; + EXIT; + END + + // Is it a Predefined Entity? + ELSE IF Name = 'lt' THEN BEGIN CurContent := CurContent + '<'; CurFinal := P+1; EXIT; END + ELSE IF Name = 'gt' THEN BEGIN CurContent := CurContent + '>'; CurFinal := P+1; EXIT; END + ELSE IF Name = 'amp' THEN BEGIN CurContent := CurContent + '&'; CurFinal := P+1; EXIT; END + ELSE IF Name = 'apos' THEN BEGIN CurContent := CurContent + ''''; CurFinal := P+1; EXIT; END + ELSE IF Name = 'quot' THEN BEGIN CurContent := CurContent + '"'; CurFinal := P+1; EXIT; END; + + // Replace with Entity from DTD + EntityDef := TEntityDef (Entities.Node (Name)); + IF EntityDef <> NIL THEN BEGIN + IF EntityDef.Value <> '' THEN BEGIN + EntityStack.Push (P+1); + CurFinal := PChar (EntityDef.Value); + END + ELSE BEGIN + ExternalEntity := LoadExternalEntity (EntityDef.SystemId, EntityDef.PublicId, EntityDef.NotationName); + EntityStack.Push (ExternalEntity, P+1); + CurFinal := ExternalEntity.DocBuffer; + END; + END + ELSE BEGIN + CurContent := CurContent + Name; + CurFinal := P+1; + END; + END + ELSE BEGIN + INC (CurFinal); + END; + END; + +VAR + C : INTEGER; +BEGIN + CurFinal := CurStart; + CurPartType := ptContent; + CurContent := ''; + C := 0; + REPEAT + CASE CurFinal^ OF + '&' : BEGIN + CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); + C := 0; + ProcessEntity; + CONTINUE; + END; + #0 : BEGIN + IF EntityStack.Count = 0 THEN + BREAK + ELSE BEGIN + CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); + C := 0; + CurFinal := EntityStack.Pop; + CONTINUE; + END; + END; + '<' : BREAK; + ELSE INC (C); + END; + INC (CurFinal); + UNTIL FALSE; + CurContent := CurContent + TranslateEncoding (StrLPas (CurFinal-C, C)); + DEC (CurFinal); + + IF FNormalize THEN BEGIN + CurContent := ConvertWs (TrimWs (CurContent), TRUE); + IsDone := CurContent <> ''; // IsDone will only get FALSE if PackSpaces is TRUE + END; +END; + + +PROCEDURE TXmlParser.AnalyzeElementDecl (Start : PChar; VAR Final : PChar); + (* Parse ' character + XmlSpec 3.2: + elementdecl ::= '' + contentspec ::= 'EMPTY' | 'ANY' | Mixed | children + Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | + '(' S? '#PCDATA' S? ')' + children ::= (choice | seq) ('?' | '*' | '+')? + choice ::= '(' S? cp ( S? '|' S? cp )* S? ')' + cp ::= (Name | choice | seq) ('?' | '*' | '+')? + seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' + + More simply: + contentspec ::= EMPTY + ANY + '(#PCDATA)' + '(#PCDATA | A | B)*' + '(A, B, C)' + '(A | B | C)' + '(A?, B*, C+), + '(A, (B | C | D)* )' *) +VAR + Element : TElemDef; + Elem2 : TElemDef; + F : PChar; + DER : TDtdElementRec; +BEGIN + Element := TElemDef.Create; + Final := Start + 9; + DER.Start := Start; + REPEAT + IF Final^ = '>' THEN BREAK; + IF (Final^ IN CNameStart) AND (Element.Name = '') THEN BEGIN + ExtractName (Final, CWhitespace, F); + SetStringSF (Element.Name, Final, F); + Final := F; + F := StrScan (Final+1, '>'); + IF F = NIL THEN BEGIN + Element.Definition := STRING (Final); + Final := StrEnd (Final); + BREAK; + END + ELSE BEGIN + SetStringSF (Element.Definition, Final+1, F-1); + Final := F; + BREAK; + END; + END; + INC (Final); + UNTIL FALSE; + Element.Definition := DelChars (Element.Definition, CWhitespace); + ReplaceParameterEntities (Element.Definition); + IF Element.Definition = 'EMPTY' THEN Element.ElemType := etEmpty + ELSE IF Element.Definition = 'ANY' THEN Element.ElemType := etAny + ELSE IF Copy (Element.Definition, 1, 8) = '(#PCDATA' THEN Element.ElemType := etMixed + ELSE IF Copy (Element.Definition, 1, 1) = '(' THEN Element.ElemType := etChildren + ELSE Element.ElemType := etAny; + + Elem2 := Elements.Node (Element.Name); + IF Elem2 <> NIL THEN + Elements.Delete (Elements.IndexOf (Elem2)); + Elements.Add (Element); + Final := StrScanE (Final, '>'); + DER.ElementType := deElement; + DER.ElemDef := Element; + DER.Final := Final; + DtdElementFound (DER); +END; + + +PROCEDURE TXmlParser.AnalyzeAttListDecl (Start : PChar; VAR Final : PChar); + (* Parse ' character + XmlSpec 3.3: + AttlistDecl ::= '' + AttDef ::= S Name S AttType S DefaultDecl + AttType ::= StringType | TokenizedType | EnumeratedType + StringType ::= 'CDATA' + TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' + EnumeratedType ::= NotationType | Enumeration + NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' + Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' + DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) + AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" + Examples: + *) +TYPE + TPhase = (phElementName, phName, phType, phNotationContent, phDefault); +VAR + Phase : TPhase; + F : PChar; + ElementName : STRING; + ElemDef : TElemDef; + AttrDef : TAttrDef; + AttrDef2 : TAttrDef; + Strg : STRING; + DER : TDtdElementRec; +BEGIN + Final := Start + 9; // The character after ' : BREAK; + ELSE CASE Phase OF + phElementName : BEGIN + ExtractName (Final, CWhitespace + CQuoteChar + ['#'], F); + SetStringSF (ElementName, Final, F); + Final := F; + ElemDef := Elements.Node (ElementName); + IF ElemDef = NIL THEN BEGIN + ElemDef := TElemDef.Create; + ElemDef.Name := ElementName; + ElemDef.Definition := 'ANY'; + ElemDef.ElemType := etAny; + Elements.Add (ElemDef); + END; + Phase := phName; + END; + phName : BEGIN + AttrDef := TAttrDef.Create; + ExtractName (Final, CWhitespace + CQuoteChar + ['#'], F); + SetStringSF (AttrDef.Name, Final, F); + Final := F; + AttrDef2 := TAttrDef (ElemDef.Node (AttrDef.Name)); + IF AttrDef2 <> NIL THEN + ElemDef.Delete (ElemDef.IndexOf (AttrDef2)); + ElemDef.Add (AttrDef); + Phase := phType; + END; + phType : BEGIN + IF Final^ = '(' THEN BEGIN + F := StrScan (Final+1, ')'); + IF F <> NIL + THEN SetStringSF (AttrDef.TypeDef, Final+1, F-1) + ELSE AttrDef.TypeDef := STRING (Final+1); + AttrDef.TypeDef := DelChars (AttrDef.TypeDef, CWhitespace); + AttrDef.AttrType := atEnumeration; + ReplaceParameterEntities (AttrDef.TypeDef); + ReplaceCharacterEntities (AttrDef.TypeDef); + Phase := phDefault; + END + ELSE IF StrLComp (Final, 'NOTATION', 8) = 0 THEN BEGIN + INC (Final, 8); + AttrDef.AttrType := atNotation; + Phase := phNotationContent; + END + ELSE BEGIN + ExtractName (Final, CWhitespace+CQuoteChar+['#'], F); + SetStringSF (AttrDef.TypeDef, Final, F); + IF AttrDef.TypeDef = 'CDATA' THEN AttrDef.AttrType := atCData + ELSE IF AttrDef.TypeDef = 'ID' THEN AttrDef.AttrType := atId + ELSE IF AttrDef.TypeDef = 'IDREF' THEN AttrDef.AttrType := atIdRef + ELSE IF AttrDef.TypeDef = 'IDREFS' THEN AttrDef.AttrType := atIdRefs + ELSE IF AttrDef.TypeDef = 'ENTITY' THEN AttrDef.AttrType := atEntity + ELSE IF AttrDef.TypeDef = 'ENTITIES' THEN AttrDef.AttrType := atEntities + ELSE IF AttrDef.TypeDef = 'NMTOKEN' THEN AttrDef.AttrType := atNmToken + ELSE IF AttrDef.TypeDef = 'NMTOKENS' THEN AttrDef.AttrType := atNmTokens; + Phase := phDefault; + END + END; + phNotationContent : BEGIN + F := StrScan (Final, ')'); + IF F <> NIL THEN + SetStringSF (AttrDef.Notations, Final+1, F-1) + ELSE BEGIN + AttrDef.Notations := STRING (Final+1); + Final := StrEnd (Final); + END; + ReplaceParameterEntities (AttrDef.Notations); + AttrDef.Notations := DelChars (AttrDef.Notations, CWhitespace); + Phase := phDefault; + END; + phDefault : BEGIN + IF Final^ = '#' THEN BEGIN + ExtractName (Final, CWhiteSpace + CQuoteChar, F); + SetStringSF (Strg, Final, F); + Final := F; + ReplaceParameterEntities (Strg); + IF Strg = '#REQUIRED' THEN BEGIN AttrDef.DefaultType := adRequired; Phase := phName; END + ELSE IF Strg = '#IMPLIED' THEN BEGIN AttrDef.DefaultType := adImplied; Phase := phName; END + ELSE IF Strg = '#FIXED' THEN AttrDef.DefaultType := adFixed; + END + ELSE IF (Final^ IN CQuoteChar) THEN BEGIN + ExtractQuote (Final, AttrDef.Value, Final); + ReplaceParameterEntities (AttrDef.Value); + ReplaceCharacterEntities (AttrDef.Value); + Phase := phName; + END; + IF Phase = phName THEN BEGIN + AttrDef := NIL; + END; + END; + + END; + END; + INC (Final); + UNTIL FALSE; + + Final := StrScan (Final, '>'); + + DER.ElementType := deAttList; + DER.ElemDef := ElemDef; + DER.Final := Final; + DtdElementFound (DER); +END; + + +PROCEDURE TXmlParser.AnalyzeEntityDecl (Start : PChar; VAR Final : PChar); + (* Parse ' character + XmlSpec 4.2: + EntityDecl ::= '' | + '' + EntityDef ::= EntityValue | (ExternalID NDataDecl?) + PEDef ::= EntityValue | ExternalID + NDataDecl ::= S 'NDATA' S Name + EntityValue ::= '"' ([^%&"] | PEReference | EntityRef | CharRef)* '"' | + "'" ([^%&'] | PEReference | EntityRef | CharRef)* "'" + PEReference ::= '%' Name ';' + + Examples + + + + "> + + + Dies ist ein Test-Absatz

"> + *) +TYPE + TPhase = (phName, phContent, phNData, phNotationName, phFinalGT); +VAR + Phase : TPhase; + IsParamEntity : BOOLEAN; + F : PChar; + ExternalID : TExternalID; + EntityDef : TEntityDef; + EntityDef2 : TEntityDef; + DER : TDtdElementRec; +BEGIN + Final := Start + 8; // First char after ' : BREAK; + ELSE CASE Phase OF + phName : IF Final^ IN CNameStart THEN BEGIN + ExtractName (Final, CWhitespace + CQuoteChar, F); + SetStringSF (EntityDef.Name, Final, F); + Final := F; + Phase := phContent; + END; + phContent : IF Final^ IN CQuoteChar THEN BEGIN + ExtractQuote (Final, EntityDef.Value, Final); + Phase := phFinalGT; + END + ELSE IF (StrLComp (Final, 'SYSTEM', 6) = 0) OR + (StrLComp (Final, 'PUBLIC', 6) = 0) THEN BEGIN + ExternalID := TExternalID.Create (Final); + EntityDef.SystemId := ExternalID.SystemId; + EntityDef.PublicId := ExternalID.PublicId; + Final := ExternalID.Final; + Phase := phNData; + ExternalID.Free; + END; + phNData : IF StrLComp (Final, 'NDATA', 5) = 0 THEN BEGIN + INC (Final, 4); + Phase := phNotationName; + END; + phNotationName : IF Final^ IN CNameStart THEN BEGIN + ExtractName (Final, CWhitespace + ['>'], F); + SetStringSF (EntityDef.NotationName, Final, F); + Final := F; + Phase := phFinalGT; + END; + phFinalGT : ; // -!- There is an error in the document if this branch is called + END; + END; + INC (Final); + UNTIL FALSE; + IF IsParamEntity THEN BEGIN + EntityDef2 := TEntityDef (ParEntities.Node (EntityDef.Name)); + IF EntityDef2 <> NIL THEN + ParEntities.Delete (ParEntities.IndexOf (EntityDef2)); + ParEntities.Add (EntityDef); + ReplaceCharacterEntities (EntityDef.Value); + END + ELSE BEGIN + EntityDef2 := TEntityDef (Entities.Node (EntityDef.Name)); + IF EntityDef2 <> NIL THEN + Entities.Delete (Entities.IndexOf (EntityDef2)); + Entities.Add (EntityDef); + ReplaceParameterEntities (EntityDef.Value); // Create replacement texts (see XmlSpec 4.5) + ReplaceCharacterEntities (EntityDef.Value); + END; + Final := StrScanE (Final, '>'); + + DER.ElementType := deEntity; + DER.EntityDef := EntityDef; + DER.Final := Final; + DtdElementFound (DER); +END; + + +PROCEDURE TXmlParser.AnalyzeNotationDecl (Start : PChar; VAR Final : PChar); + // Parse ' character + // XmlSpec 4.7: NotationDecl ::= '' +TYPE + TPhase = (phName, phExtId, phEnd); +VAR + ExternalID : TExternalID; + Phase : TPhase; + F : PChar; + NotationDef : TNotationDef; + DER : TDtdElementRec; +BEGIN + Final := Start + 10; // Character after ', + #0 : BREAK; + ELSE CASE Phase OF + phName : BEGIN + ExtractName (Final, CWhitespace + ['>'], F); + SetStringSF (NotationDef.Name, Final, F); + Final := F; + Phase := phExtId; + END; + phExtId : BEGIN + ExternalID := TExternalID.Create (Final); + NotationDef.Value := ExternalID.SystemId; + NotationDef.PublicId := ExternalID.PublicId; + Final := ExternalId.Final; + ExternalId.Free; + Phase := phEnd; + END; + phEnd : ; // -!- There is an error in the document if this branch is called + END; + END; + INC (Final); + UNTIL FALSE; + Notations.Add (NotationDef); + Final := StrScanE (Final, '>'); + + DER.ElementType := deNotation; + DER.NotationDef := NotationDef; + DER.Final := Final; + DtdElementFound (DER); +END; + + +PROCEDURE TXmlParser.PushPE (VAR Start : PChar); + (* If there is a parameter entity reference found in the data stream, + the current position will be pushed to the entity stack. + Start: IN Pointer to the '%' character starting the PE reference + OUT Pointer to first character of PE replacement text *) +VAR + P : PChar; + EntityDef : TEntityDef; +BEGIN + P := StrScan (Start, ';'); + IF P <> NIL THEN BEGIN + EntityDef := TEntityDef (ParEntities.Node (StrSFPas (Start+1, P-1))); + IF EntityDef <> NIL THEN BEGIN + EntityStack.Push (P+1); + Start := PChar (EntityDef.Value); + END + ELSE + Start := P+1; + END; +END; + + +PROCEDURE TXmlParser.ReplaceCharacterEntities (VAR Str : STRING); + // Replaces all Character Entity References in the String +VAR + Start : INTEGER; + PAmp : PChar; + PSemi : PChar; + PosAmp : INTEGER; + Len : INTEGER; // Length of Entity Reference +BEGIN + IF Str = '' THEN EXIT; + Start := 1; + REPEAT + PAmp := StrPos (PChar (Str) + Start-1, '&#'); + IF PAmp = NIL THEN BREAK; + PSemi := StrScan (PAmp+2, ';'); + IF PSemi = NIL THEN BREAK; + PosAmp := PAmp - PChar (Str) + 1; + Len := PSemi-PAmp+1; + IF CompareText (Str [PosAmp+2], 'x') = 0 // !!! Can't use "CHR" for Unicode characters > 255 + THEN Str [PosAmp] := CHR (StrToIntDef ('$'+Copy (Str, PosAmp+3, Len-4), 0)) + ELSE Str [PosAmp] := CHR (StrToIntDef (Copy (Str, PosAmp+2, Len-3), 32)); + Delete (Str, PosAmp+1, Len-1); + Start := PosAmp + 1; + UNTIL FALSE; +END; + + +PROCEDURE TXmlParser.ReplaceParameterEntities (VAR Str : STRING); + // Recursively replaces all Parameter Entity References in the String + PROCEDURE ReplaceEntities (VAR Str : STRING); + VAR + Start : INTEGER; + PAmp : PChar; + PSemi : PChar; + PosAmp : INTEGER; + Len : INTEGER; + Entity : TEntityDef; + Repl : STRING; // Replacement + BEGIN + IF Str = '' THEN EXIT; + Start := 1; + REPEAT + PAmp := StrPos (PChar (Str)+Start-1, '%'); + IF PAmp = NIL THEN BREAK; + PSemi := StrScan (PAmp+2, ';'); + IF PSemi = NIL THEN BREAK; + PosAmp := PAmp - PChar (Str) + 1; + Len := PSemi-PAmp+1; + Entity := TEntityDef (ParEntities.Node (Copy (Str, PosAmp+1, Len-2))); + IF Entity <> NIL THEN BEGIN + Repl := Entity.Value; + ReplaceEntities (Repl); // Recursion + END + ELSE + Repl := Copy (Str, PosAmp, Len); + Delete (Str, PosAmp, Len); + Insert (Repl, Str, PosAmp); + Start := PosAmp + Length (Repl); + UNTIL FALSE; + END; +BEGIN + ReplaceEntities (Str); +END; + + +PROCEDURE TXmlParser.ReplaceGeneralEntities (VAR Str : STRING); + // Recursively replaces General Entity References in the String + PROCEDURE ReplaceEntities (VAR Str : STRING); + VAR + Start : INTEGER; + PAmp : PChar; + PSemi : PChar; + PosAmp : INTEGER; + Len : INTEGER; + EntityDef : TEntityDef; + EntName : STRING; + Repl : STRING; // Replacement + ExternalEntity : TXmlParser; + BEGIN + IF Str = '' THEN EXIT; + Start := 1; + REPEAT + PAmp := StrPos (PChar (Str)+Start-1, '&'); + IF PAmp = NIL THEN BREAK; + PSemi := StrScan (PAmp+2, ';'); + IF PSemi = NIL THEN BREAK; + PosAmp := PAmp - PChar (Str) + 1; + Len := PSemi-PAmp+1; + EntName := Copy (Str, PosAmp+1, Len-2); + IF EntName = 'lt' THEN Repl := '<' + ELSE IF EntName = 'gt' THEN Repl := '>' + ELSE IF EntName = 'amp' THEN Repl := '&' + ELSE IF EntName = 'apos' THEN Repl := '''' + ELSE IF EntName = 'quot' THEN Repl := '"' + ELSE BEGIN + EntityDef := TEntityDef (Entities.Node (EntName)); + IF EntityDef <> NIL THEN BEGIN + IF EntityDef.Value <> '' THEN // Internal Entity + Repl := EntityDef.Value + ELSE BEGIN // External Entity + ExternalEntity := LoadExternalEntity (EntityDef.SystemId, EntityDef.PublicId, EntityDef.NotationName); + Repl := StrPas (ExternalEntity.DocBuffer); // !!! What if it contains a Text Declaration? + ExternalEntity.Free; + END; + ReplaceEntities (Repl); // Recursion + END + ELSE + Repl := Copy (Str, PosAmp, Len); + END; + Delete (Str, PosAmp, Len); + Insert (Repl, Str, PosAmp); + Start := PosAmp + Length (Repl); + UNTIL FALSE; + END; +BEGIN + ReplaceEntities (Str); +END; + + +FUNCTION TXmlParser.LoadExternalEntity (SystemId, PublicId, Notation : STRING) : TXmlParser; + // This will be called whenever there is a Parsed External Entity or + // the DTD External Subset to be parsed. + // It has to create a TXmlParser instance and load the desired Entity. + // This instance of LoadExternalEntity assumes that "SystemId" is a valid + // file name (relative to the Document source) and loads this file using + // the LoadFromFile method. +VAR + Filename : STRING; +BEGIN + // --- Convert System ID to complete filename + Filename := StringReplace (SystemId, '/', '\', [rfReplaceAll]); + IF Copy (FSource, 1, 1) <> '<' THEN + IF (Copy (Filename, 1, 2) = '\\') OR (Copy (Filename, 2, 1) = ':') THEN + // Already has an absolute Path + ELSE BEGIN + Filename := ExtractFilePath (FSource) + Filename; + END; + + // --- Load the File + Result := TXmlParser.Create; + Result.LoadFromFile (Filename); +END; + + +FUNCTION TXmlParser.TranslateEncoding (CONST Source : STRING) : STRING; + // The member variable "CurEncoding" always holds the name of the current + // encoding, e.g. 'UTF-8' or 'ISO-8859-1'. + // This virtual method "TranslateEncoding" is responsible for translating + // the content passed in the "Source" parameter to the Encoding which + // is expected by the application. + // This instance of "TranlateEncoding" assumes that the Application expects + // Windows ANSI (Win1252) strings. It is able to transform UTF-8 or ISO-8859-1 + // encodings. + // If you want your application to understand or create other encodings, you + // override this function. +BEGIN + IF CurEncoding = 'UTF-8' + THEN Result := Utf8ToAnsi (Source) + ELSE Result := Source; +END; + + +PROCEDURE TXmlParser.DtdElementFound (DtdElementRec : TDtdElementRec); + // This method is called for every element which is found in the DTD + // declaration. The variant record TDtdElementRec is passed which + // holds informations about the element. + // You can override this function to handle DTD declarations. + // Note that when you parse the same Document instance a second time, + // the DTD will not get parsed again. +BEGIN +END; + + +FUNCTION TXmlParser.GetDocBuffer: PChar; + // Returns FBuffer or a pointer to a NUL char if Buffer is empty +BEGIN + IF FBuffer = NIL + THEN Result := #0 + ELSE Result := FBuffer; +END; + + +(*$IFNDEF HAS_CONTNRS_UNIT +=============================================================================================== +TObjectList +=============================================================================================== +*) + +DESTRUCTOR TObjectList.Destroy; +BEGIN + Clear; + SetCapacity(0); + INHERITED Destroy; +END; + + +PROCEDURE TObjectList.Delete (Index : INTEGER); +BEGIN + IF (Index < 0) OR (Index >= Count) THEN EXIT; + TObject (Items [Index]).Free; + INHERITED Delete (Index); +END; + + +PROCEDURE TObjectList.Clear; +BEGIN + WHILE Count > 0 DO + Delete (Count-1); +END; + +(*$ENDIF *) + +(* +=============================================================================================== +TNvpNode +-------- +Node base class for the TNvpList +=============================================================================================== +*) + +CONSTRUCTOR TNvpNode.Create (TheName, TheValue : STRING); +BEGIN + INHERITED Create; + Name := TheName; + Value := TheValue; +END; + + +(* +=============================================================================================== +TNvpList +-------- +A generic List of Name-Value Pairs, based on the TObjectList introduced in Delphi 5 +=============================================================================================== +*) + +PROCEDURE TNvpList.Add (Node : TNvpNode); +VAR + I : INTEGER; +BEGIN + FOR I := Count-1 DOWNTO 0 DO + IF Node.Name > TNvpNode (Items [I]).Name THEN BEGIN + Insert (I+1, Node); + EXIT; + END; + Insert (0, Node); +END; + + + +FUNCTION TNvpList.Node (Name : STRING) : TNvpNode; + // Binary search for Node +VAR + L, H : INTEGER; // Low, High Limit + T, C : INTEGER; // Test Index, Comparison result + Last : INTEGER; // Last Test Index +BEGIN + IF Count=0 THEN BEGIN + Result := NIL; + EXIT; + END; + + L := 0; + H := Count; + Last := -1; + REPEAT + T := (L+H) DIV 2; + IF T=Last THEN BREAK; + Result := TNvpNode (Items [T]); + C := CompareStr (Result.Name, Name); + IF C = 0 THEN EXIT + ELSE IF C < 0 THEN L := T + ELSE H := T; + Last := T; + UNTIL FALSE; + Result := NIL; +END; + + +FUNCTION TNvpList.Node (Index : INTEGER) : TNvpNode; +BEGIN + IF (Index < 0) OR (Index >= Count) + THEN Result := NIL + ELSE Result := TNvpNode (Items [Index]); +END; + + +FUNCTION TNvpList.Value (Name : STRING) : STRING; +VAR + Nvp : TNvpNode; +BEGIN + Nvp := TNvpNode (Node (Name)); + IF Nvp <> NIL + THEN Result := Nvp.Value + ELSE Result := ''; +END; + + +FUNCTION TNvpList.Value (Index : INTEGER) : STRING; +BEGIN + IF (Index < 0) OR (Index >= Count) + THEN Result := '' + ELSE Result := TNvpNode (Items [Index]).Value; +END; + + +FUNCTION TNvpList.Name (Index : INTEGER) : STRING; +BEGIN + IF (Index < 0) OR (Index >= Count) + THEN Result := '' + ELSE Result := TNvpNode (Items [Index]).Name; +END; + + +(* +=============================================================================================== +TAttrList +List of Attributes. The "Analyze" method extracts the Attributes from the given Buffer. +Is used for extraction of Attributes in Start-Tags, Empty-Element Tags and the "pseudo" +attributes in XML Prologs, Text Declarations and PIs. +=============================================================================================== +*) + +PROCEDURE TAttrList.Analyze (Start : PChar; VAR Final : PChar); + // Analyze the Buffer for Attribute=Name pairs. + // Terminates when there is a character which is not IN CNameStart + // (e.g. '?>' or '>' or '/>') +TYPE + TPhase = (phName, phEq, phValue); +VAR + Phase : TPhase; + F : PChar; + Name : STRING; + Value : STRING; + Attr : TAttr; +BEGIN + Clear; + Phase := phName; + Final := Start; + REPEAT + IF (Final^ = #0) OR (Final^ = '>') THEN BREAK; + IF NOT (Final^ IN CWhitespace) THEN + CASE Phase OF + phName : BEGIN + IF NOT (Final^ IN CNameStart) THEN EXIT; + ExtractName (Final, CWhitespace + ['=', '/'], F); + SetStringSF (Name, Final, F); + Final := F; + Phase := phEq; + END; + phEq : BEGIN + IF Final^ = '=' THEN + Phase := phValue + END; + phValue : BEGIN + IF Final^ IN CQuoteChar THEN BEGIN + ExtractQuote (Final, Value, F); + Attr := TAttr.Create; + Attr.Name := Name; + Attr.Value := Value; + Attr.ValueType := vtNormal; + Add (Attr); + Final := F; + Phase := phName; + END; + END; + END; + INC (Final); + UNTIL FALSE; +END; + + +(* +=============================================================================================== +TElemList +List of TElemDef nodes. +=============================================================================================== +*) + +FUNCTION TElemList.Node (Name : STRING) : TElemDef; + // Binary search for the Node with the given Name +VAR + L, H : INTEGER; // Low, High Limit + T, C : INTEGER; // Test Index, Comparison result + Last : INTEGER; // Last Test Index +BEGIN + IF Count=0 THEN BEGIN + Result := NIL; + EXIT; + END; + + L := 0; + H := Count; + Last := -1; + REPEAT + T := (L+H) DIV 2; + IF T=Last THEN BREAK; + Result := TElemDef (Items [T]); + C := CompareStr (Result.Name, Name); + IF C = 0 THEN EXIT + ELSE IF C < 0 THEN L := T + ELSE H := T; + Last := T; + UNTIL FALSE; + Result := NIL; +END; + + +PROCEDURE TElemList.Add (Node : TElemDef); +VAR + I : INTEGER; +BEGIN + FOR I := Count-1 DOWNTO 0 DO + IF Node.Name > TElemDef (Items [I]).Name THEN BEGIN + Insert (I+1, Node); + EXIT; + END; + Insert (0, Node); +END; + + +(* +=============================================================================================== +TScannerXmlParser +A TXmlParser descendant for the TCustomXmlScanner component +=============================================================================================== +*) + +TYPE + TScannerXmlParser = CLASS (TXmlParser) + Scanner : TCustomXmlScanner; + CONSTRUCTOR Create (TheScanner : TCustomXmlScanner); + FUNCTION LoadExternalEntity (SystemId, PublicId, + Notation : STRING) : TXmlParser; OVERRIDE; + FUNCTION TranslateEncoding (CONST Source : STRING) : STRING; OVERRIDE; + PROCEDURE DtdElementFound (DtdElementRec : TDtdElementRec); OVERRIDE; + END; + +CONSTRUCTOR TScannerXmlParser.Create (TheScanner : TCustomXmlScanner); +BEGIN + INHERITED Create; + Scanner := TheScanner; +END; + + +FUNCTION TScannerXmlParser.LoadExternalEntity (SystemId, PublicId, Notation : STRING) : TXmlParser; +BEGIN + IF Assigned (Scanner.FOnLoadExternal) + THEN Scanner.FOnLoadExternal (Scanner, SystemId, PublicId, Notation, Result) + ELSE Result := INHERITED LoadExternalEntity (SystemId, PublicId, Notation); +END; + + +FUNCTION TScannerXmlParser.TranslateEncoding (CONST Source : STRING) : STRING; +BEGIN + IF Assigned (Scanner.FOnTranslateEncoding) + THEN Result := Scanner.FOnTranslateEncoding (Scanner, CurEncoding, Source) + ELSE Result := INHERITED TranslateEncoding (Source); +END; + + +PROCEDURE TScannerXmlParser.DtdElementFound (DtdElementRec : TDtdElementRec); +BEGIN + WITH DtdElementRec DO + CASE ElementType OF + deElement : Scanner.WhenElement (ElemDef); + deAttList : Scanner.WhenAttList (ElemDef); + deEntity : Scanner.WhenEntity (EntityDef); + deNotation : Scanner.WhenNotation (NotationDef); + dePI : Scanner.WhenPI (STRING (Target), STRING (Content), AttrList); + deComment : Scanner.WhenComment (StrSFPas (Start, Final)); + deError : Scanner.WhenDtdError (Pos); + END; +END; + + +(* +=============================================================================================== +TCustomXmlScanner +=============================================================================================== +*) + +CONSTRUCTOR TCustomXmlScanner.Create (AOwner: TComponent); +BEGIN + INHERITED; + FXmlParser := TScannerXmlParser.Create (Self); +END; + + +DESTRUCTOR TCustomXmlScanner.Destroy; +BEGIN + FXmlParser.Free; + INHERITED; +END; + + +PROCEDURE TCustomXmlScanner.LoadFromFile (Filename : TFilename); + // Load XML Document from file +BEGIN + FXmlParser.LoadFromFile (Filename); +END; + + +PROCEDURE TCustomXmlScanner.LoadFromBuffer (Buffer : PChar); + // Load XML Document from buffer +BEGIN + FXmlParser.LoadFromBuffer (Buffer); +END; + + +PROCEDURE TCustomXmlScanner.SetBuffer (Buffer : PChar); + // Refer to Buffer +BEGIN + FXmlParser.SetBuffer (Buffer); +END; + + +FUNCTION TCustomXmlScanner.GetFilename : TFilename; +BEGIN + Result := FXmlParser.Source; +END; + + +FUNCTION TCustomXmlScanner.GetNormalize : BOOLEAN; +BEGIN + Result := FXmlParser.Normalize; +END; + + +PROCEDURE TCustomXmlScanner.SetNormalize (Value : BOOLEAN); +BEGIN + FXmlParser.Normalize := Value; +END; + + +PROCEDURE TCustomXmlScanner.WhenXmlProlog(XmlVersion, Encoding: STRING; Standalone : BOOLEAN); + // Is called when the parser has parsed the declaration of the prolog +BEGIN + IF Assigned (FOnXmlProlog) THEN FOnXmlProlog (Self, XmlVersion, Encoding, Standalone); +END; + + +PROCEDURE TCustomXmlScanner.WhenComment (Comment : STRING); + // Is called when the parser has parsed a +BEGIN + IF Assigned (FOnComment) THEN FOnComment (Self, Comment); +END; + + +PROCEDURE TCustomXmlScanner.WhenPI (Target, Content: STRING; Attributes : TAttrList); + // Is called when the parser has parsed a +BEGIN + IF Assigned (FOnPI) THEN FOnPI (Self, Target, Content, Attributes); +END; + + +PROCEDURE TCustomXmlScanner.WhenDtdRead (RootElementName : STRING); + // Is called when the parser has completely parsed the DTD +BEGIN + IF Assigned (FOnDtdRead) THEN FOnDtdRead (Self, RootElementName); +END; + + +PROCEDURE TCustomXmlScanner.WhenStartTag (TagName : STRING; Attributes : TAttrList); + // Is called when the parser has parsed a start tag like

+BEGIN + IF Assigned (FOnStartTag) THEN FOnStartTag (Self, TagName, Attributes); +END; + + +PROCEDURE TCustomXmlScanner.WhenEmptyTag (TagName : STRING; Attributes : TAttrList); + // Is called when the parser has parsed an Empty Element Tag like
+BEGIN + IF Assigned (FOnEmptyTag) THEN FOnEmptyTag (Self, TagName, Attributes); +END; + + +PROCEDURE TCustomXmlScanner.WhenEndTag (TagName : STRING); + // Is called when the parser has parsed an End Tag like

+BEGIN + IF Assigned (FOnEndTag) THEN FOnEndTag (Self, TagName); +END; + + +PROCEDURE TCustomXmlScanner.WhenContent (Content : STRING); + // Is called when the parser has parsed an element's text content +BEGIN + IF Assigned (FOnContent) THEN FOnContent (Self, Content); +END; + + +PROCEDURE TCustomXmlScanner.WhenCData (Content : STRING); + // Is called when the parser has parsed a CDATA section +BEGIN + IF Assigned (FOnCData) THEN FOnCData (Self, Content); +END; + + +PROCEDURE TCustomXmlScanner.WhenElement (ElemDef : TElemDef); + // Is called when the parser has parsed an definition + // inside the DTD +BEGIN + IF Assigned (FOnElement) THEN FOnElement (Self, ElemDef); +END; + + +PROCEDURE TCustomXmlScanner.WhenAttList (ElemDef : TElemDef); + // Is called when the parser has parsed an definition + // inside the DTD +BEGIN + IF Assigned (FOnAttList) THEN FOnAttList (Self, ElemDef); +END; + + +PROCEDURE TCustomXmlScanner.WhenEntity (EntityDef : TEntityDef); + // Is called when the parser has parsed an definition + // inside the DTD +BEGIN + IF Assigned (FOnEntity) THEN FOnEntity (Self, EntityDef); +END; + + +PROCEDURE TCustomXmlScanner.WhenNotation (NotationDef : TNotationDef); + // Is called when the parser has parsed a definition + // inside the DTD +BEGIN + IF Assigned (FOnNotation) THEN FOnNotation (Self, NotationDef); +END; + + +PROCEDURE TCustomXmlScanner.WhenDtdError (ErrorPos : PChar); + // Is called when the parser has found an Error in the DTD +BEGIN + IF Assigned (FOnDtdError) THEN FOnDtdError (Self, ErrorPos); +END; + + +PROCEDURE TCustomXmlScanner.Execute; + // Perform scanning + // Scanning is done synchronously, i.e. you can expect events to be triggered + // in the order of the XML data stream. Execute will finish when the whole XML + // document has been scanned or when the StopParser property has been set to TRUE. +BEGIN + FStopParser := FALSE; + FXmlParser.StartScan; + WHILE FXmlParser.Scan AND (NOT FStopParser) DO + CASE FXmlParser.CurPartType OF + ptNone : ; + ptXmlProlog : WhenXmlProlog (FXmlParser.XmlVersion, FXmlParser.Encoding, FXmlParser.Standalone); + ptComment : WhenComment (StrSFPas (FXmlParser.CurStart, FXmlParser.CurFinal)); + ptPI : WhenPI (FXmlParser.CurName, FXmlParser.CurContent, FXmlParser.CurAttr); + ptDtdc : WhenDtdRead (FXmlParser.RootName); + ptStartTag : WhenStartTag (FXmlParser.CurName, FXmlParser.CurAttr); + ptEmptyTag : WhenEmptyTag (FXmlParser.CurName, FXmlParser.CurAttr); + ptEndTag : WhenEndTag (FXmlParser.CurName); + ptContent : WhenContent (FXmlParser.CurContent); + ptCData : WhenCData (FXmlParser.CurContent); + END; +END; + + +END. -- cgit v1.2.3