Title: Building an Easy-to-Use Parser/Parsing Framework (Part II)
Question: How to create a simple parsing framework to parse any kind of data?
Answer:
Building an Easy-to-Use Parser/Parsing Framework (Part II)
Example
Welcome to the second part of my article "Building an Easy-to-Use
Parser/Parsing Framework". This time, I want to show you how to create a
real working dtd parser as exemplified in the first part. If you don't read
my first article, please make up for this now:
Building an
Easy-to-Use Parser/Parsing Framework (Part I)
As mentioned earlier, we need a dtd document which holds up all our parsed
informations in an easy-to-access object model. Take a look at the
following interface section:
type
{ TDTDAttributeTyp }
TDTDAttributeTyp =
(atData, atID, atIDRef, atEnumeration);
{ TDTDAttributeStatus }
TDTDAttributeStatus =
(asDefault, asImplied, asRequired, asFixed);
{ TDTDChildTyp }
TDTDChildTyp =
(ctElement, ctChoice, ctSequence);
{ TDTDElementTyp }
TDTDElementTyp =
(etAny, etEmpty, etData, etContainer);
{ TDTDElementStatus }
TDTDElementStatus =
(esRequired, esRequiredSeq, esOptional, esOptionalSeq);
{ TDTDItem }
TDTDItem = class(TCollectionItem)
private
{ Private declarations }
FName: string;
public
{ Public declarations }
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Name: string read FName write FName;
end;
{ TDTDItems }
TDTDItems = class(TCollection)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDItem;
procedure SetItem(Index: Integer; Value: TDTDItem);
public
{ Public declarations }
function Add: TDTDItem;
function Find(const Name: string): TDTDItem;
property Items[Index: Integer]: TDTDItem read GetItem write SetItem;
default;
end;
{ TDTDEntity }
TDTDEntity = class(TDTDItem)
private
{ Private declarations }
public
{ Public declarations }
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
end;
{ TDTDEntities }
TDTDEntities = class(TDTDItems)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDEntity;
procedure SetItem(Index: Integer; Value: TDTDEntity);
public
{ Public declarations }
function Add: TDTDEntity;
function Find(const Name: string): TDTDEntity;
property Items[Index: Integer]: TDTDEntity read GetItem write SetItem;
default;
end;
{ TDTDEnum }
TDTDEnum = class(TDTDItem)
private
{ Private declarations }
public
{ Public declarations }
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
end;
{ TDTDEnums }
TDTDEnums = class(TDTDItems)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDEnum;
procedure SetItem(Index: Integer; Value: TDTDEnum);
public
{ Public declarations }
function Add: TDTDEnum;
function Find(const Name: string): TDTDEnum;
property Items[Index: Integer]: TDTDEnum read GetItem write SetItem;
default;
end;
{ TDTDAttribute }
TDTDAttribute = class(TDTDItem)
private
{ Private declarations }
FTyp: TDTDAttributeTyp;
FStatus: TDTDAttributeStatus;
FDefault: string;
FEnums: TDTDEnums;
procedure SetEnums(Value: TDTDEnums);
public
{ Public declarations }
constructor Create(Collection: TCollection); override;
destructor Destroy; override;
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Typ: TDTDAttributeTyp read FTyp write FTyp;
property Status: TDTDAttributeStatus read FStatus write FStatus;
property Default: string read FDefault write FDefault;
property Enums: TDTDEnums read FEnums write SetEnums;
end;
{ TDTDAttributes }
TDTDAttributes = class(TDTDItems)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDAttribute;
procedure SetItem(Index: Integer; Value: TDTDAttribute);
public
{ Public declarations }
function Add: TDTDAttribute;
function Find(const Name: string): TDTDAttribute;
property Items[Index: Integer]: TDTDAttribute read GetItem write
SetItem; default;
end;
{ TDTDProperty }
TDTDProperty = class(TDTDItem)
private
{ Private declarations }
FStatus: TDTDElementStatus;
public
{ Public declarations }
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Status: TDTDElementStatus read FStatus write FStatus;
end;
{ TDTDProperties}
TDTDProperties = class(TDTDItems)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDProperty;
procedure SetItem(Index: Integer; Value: TDTDProperty);
public
{ Public declarations }
function Add: TDTDProperty;
function Find(const Name: string): TDTDProperty;
property Items[Index: Integer]: TDTDProperty read GetItem write
SetItem; default;
end;
{ TDTDChild }
TDTDChilds = class;
TDTDChild = class(TDTDProperty)
private
{ Private declarations }
FTyp: TDTDChildTyp;
FChilds: TDTDChilds;
procedure SetChilds(const Value: TDTDChilds);
public
{ Public declarations }
constructor Create(Collection: TCollection); override;
destructor Destroy; override;
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Typ: TDTDChildTyp read FTyp write FTyp;
property Childs: TDTDChilds read FChilds write SetChilds;
end;
{ TDTDChilds}
TDTDChilds = class(TDTDProperties)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDChild;
procedure SetItem(Index: Integer; Value: TDTDChild);
public
{ Public declarations }
function Add: TDTDChild;
function Find(const Name: string): TDTDChild;
property Items[Index: Integer]: TDTDChild read GetItem write SetItem;
default;
end;
{ TDTDElement }
TDTDElement = class(TDTDProperty)
private
{ Private declarations }
FTyp: TDTDElementTyp;
FAttributes: TDTDAttributes;
FChilds: TDTDChilds;
procedure SetAttributes(Value: TDTDAttributes);
procedure SetChilds(Value: TDTDChilds);
public
{ Public declarations }
constructor Create(Collection: TCollection); override;
destructor Destroy; override;
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Typ: TDTDElementTyp read FTyp write FTyp;
property Attributes: TDTDAttributes read FAttributes write
SetAttributes;
property Childs: TDTDChilds read FChilds write SetChilds;
end;
{ TDTDElements }
TDTDElements = class(TDTDProperties)
private
{ Private declarations }
function GetItem(Index: Integer): TDTDElement;
procedure SetItem(Index: Integer; Value: TDTDElement);
public
{ Public declarations }
function Add: TDTDElement;
function Find(const Name: string): TDTDElement;
property Items[Index: Integer]: TDTDElement read GetItem write
SetItem; default;
end;
{ TDTDDocument }
TDTDDocument = class(TPersistent)
private
{ Private declarations }
FEntities: TDTDEntities;
FElements: TDTDElements;
procedure SetEntities(Value: TDTDEntities);
procedure SetElements(Value: TDTDElements);
public
{ Public declarations }
constructor Create;
destructor Destroy; override;
procedure Assign(Source: TPersistent); override;
published
{ Published declarations }
property Entities: TDTDEntities read FEntities write SetEntities;
property Elements: TDTDElements read FElements write SetElements;
end;
This model implements all needed objects to parse a dtd file. Notice, that
not all dtd grammars are reflected in this model, it's up to you to
improve my work - but it's enough to parse all standard dtd files.
Next, we need to create our dtd parser, which will be inherited by
TValidationParser as professed in Part I:
type
{ EDTDParser }
EDTDParser = class(Exception);
{ TDTDParser }
TDTDParser = class(TValidationParser)
private
{ Private declarations }
procedure ParseElement(Parser: TStringParser; Document: TDTDDocument;
const Pass: Integer);
procedure ParseAttlist(Parser: TStringParser; Document: TDTDDocument);
procedure ParseFile(const FileName: string; Document: TDTDDocument;
const Pass: Integer = 0);
public
{ Public declarations }
procedure Parse(const FileName: string; var Document: TDTDDocument);
end;
The new exception class EDTDParser will be raised, if the passed
filename is physical not available. One of the weightily methods is
Parse. The first parameter must be an existing filename of the dtd
file to be parsed. The second parameter is the document which holds our
object model and must be pre-initialized. The implementation of this
method is as followed:
01. procedure TDTDParser.Parse(const FileName: string; var Document: TDTDDocument);
02. var
03. TmpDocument: TDTDDocument;
04. begin
05. if not assigned(Document) then
06. raise EDTDParser.Create('Document not assigned!');
07. TmpDocument := TDTDDocument.Create;
08. try
09. ParseFile(FileName, TmpDocument);
10. if Errors.Count = 0 then
11. Document.Assign(TmpDocument);
12. finally
13. TmpDocument.Free;
14. end;
15. end;
In Line 5 we're looking if the passed document was successfully
initialized; if not, an exception (EDTDParser) will be raised.
After comparing that, we create a new temporary instance of a dtd document
(Line 7) and parse the passed filename (Line 9). If no errors occured
(Line 10) we make a copy of the filled dtd document by assigning it to the
passed one (Line 11).
Consecutively we take a look to the ParseFile procedure, which
initializes the main parsing process and looks for the basic keywords:
(Note: The italic lines are not part of the sourcecode - they are used to
explain the unique sections)
procedure TDTDParser.ParseFile(const FileName: string;
Document: TDTDDocument; const Pass: Integer = 0);
var
Parser: TStringParser;
begin
{Create a new instance of the TStringParser.}
Parser := TStringParser.Create;
try
{Check, if the passed filename already exists.}
if not Parser.LoadFromFile(FileName) then
begin
AddErrorFmt('File "%s" not found', [FileName], Parser);
Exit;
end;
{Initialize an endless loop.}
while True do
begin
{Skip to the next valid Tag-Begin-Token "
while not (Parser.Token in [toEOF, ' Parser.SkipToken;
{Break look, if current Token is EOF - End of File.}
if Parser.Token = toEOF then
Break;
{Get the next Token - after Tag-Begin "
Parser.SkipToken;
{Check for valid identification Tag "!" or "?".}
if Parser.Token '!' then
begin
{Only add an error if the current Pass is one "1".}
if not(Parser.Token in ['?']) and (Pass = 1) then
AddError('InvalidToken', Parser);
Continue;
end;
{Check for valid Symbol or Comment Line.}
if Parser.SkipToken toSymbol then
begin
if (Parser.Token '-') and (Pass = 1) then
AddError('InvalidToken', Parser);
Continue;
end;
{Check for "Entity" Tag.}
if UpperCase(Parser.TokenString) = 'ENTITY' then
Continue;
{Check for "Element" Tag.}
if UpperCase(Parser.TokenString) = 'ELEMENT' then
ParseElement(Parser, Document, Pass)
else
{Check for "Attribute" Tag.}
if UpperCase(Parser.TokenString) = 'ATTLIST' then
begin
if Pass = 1 then
ParseAttlist(Parser, Document);
end
{Add an error on invalid Symbols.}
else
if Pass = 1 then
AddErrorFmt('Invalid Symbol "%s"', [Parser.TokenString], Parser);
end;
{Initialize Pass 2 - if currently finished Pass 1.}
if Pass = 0 then
ParseFile(FileName, Document, 1);
finally
Parser.Free;
end;
end;
The ParseFile method simply starts parsing the main structure of a
dtd file and tries to extract some basic keywords like Entity,
Element or Attribute. If one of the last two keywords were
found, a special (ParseElement or ParseAttlist) method is
called to create the corresponding object and to extract additional
informations. If the parser founds any syntax or grammar errors,
respectively items are created.
The method ParseElement includes the functionality to parse and
extract further informations, like Type or Rule:
(Note: The italic lines are not part of the sourcecode - they are used to
explain the unique sections)
procedure TDTDParser.ParseElement(Parser: TStringParser;
Document: TDTDDocument; const Pass: Integer);
var
Element: TDTDElement;
Child: TDTDChild;
Rule: string;
begin
{Get the next Token.}
Parser.SkipToken;
{On first pass, create a new element.}
if Pass = 0 then
Element := Document.Elements.Add
{On second pass, find previous created element.}
else
Element := Document.Elements.Find(Parser.TokenString);
{Set the new element name.}
Element.Name := Parser.TokenString;
try
{Add an error if the current Token isn't a symbol.}
if Parser.Token toSymbol then
Abort;
{Check for element rule, like "any", "empty" or "sequence"...}
Rule := UpperCase(Parser.SkipTokenString);
{...Found Rule: "ANY".}
if (Rule = 'ANY') and (Parser.SkipToken = '') then
begin
Element.Typ := etAny;
Exit;
end;
{...Found Rule: "EMPTY".}
if (Rule = 'EMPTY') and (Parser.SkipToken = '') then
begin
Element.Typ := etEmpty;
Exit;
end;
if (Rule = '(') then
begin
{...Found Rule: "PCDATA".}
if Parser.SkipToken in [toEOF, ''] then
Abort;
if Parser.Token = '#' then
begin
if UpperCase(Parser.SkipToToken('')) = 'PCDATA)' then
begin
Element.Typ := etData;
Exit;
end;
Abort;
end;
{...Found Rule: "sequence/container".}
Element.Typ := etContainer;
repeat
{Create Child objects, if pass = 1.}
Child := nil;
if not (Parser.Token in ['|', ',', ')']) then
begin
if Pass = 0 then
begin
Child := Element.Childs.Add;
Child.Name := Parser.TokenString;
Child.Typ := ctElement;
end
else
if Document.Elements.Find(Parser.TokenString) = nil then
AddErrorFmt('Invalid Element Target "%s"', [Parser.TokenString], Parser);
end;
Parser.SkipToken;
{Check Child Status (=sequence style).}
if Parser.Token in ['+', '?', '*'] then
begin
if Child nil then
case Parser.Token of
'+':
Child.Status := esRequiredSeq;
'?':
Child.Status := esOptional;
'*':
Child.Status := esOptionalSeq;
end;
Parser.SkipToken;
end;
until Parser.SkipToken in [toEOF, ''];
Exit;
end;
{Add an error only on pass 1.}
if Pass = 1 then
AddErrorFmt('Invalid Element Rule "%s"', [Rule], Parser);
except
{Add an error only on pass 1.}
if Pass = 1 then
AddError('InvalidElementFormat', Parser);
end;
end;
The method ParseAttlist includes the functionality to parse and
extract further informations, like Type or Enumerations:
(Note: The italic lines are not part of the sourcecode - they are used to
explain the unique sections)
procedure TDTDParser.ParseAttlist(Parser: TStringParser; Document: TDTDDocument);
var
Attribute: TDTDAttribute;
Element: TDTDElement;
Target, Typ: string;
begin
{Get the next Token.}
Target := Parser.SkipTokenString;
try
{Add an error if the current Token isn't a symbol.}
if Parser.Token toSymbol then
Abort;
{Try to find the element target.}
Element := Document.Elements.Find(Target);
{Add an error if no element was found.}
if Element = nil then
begin
AddErrorFmt('Invalid Element Target "%s"', [Target], Parser);
Exit;
end;
{Get the next Token.}
Parser.SkipToken;
repeat
{Add an error if the current Token isn't a symbol.}
if Parser.Token toSymbol then
Abort;
{Create a new Attribute under the located element.}
Attribute := Element.Attributes.Add;
{Set the new name.}
Attribute.Name := Parser.TokenString;
{Check for Attribute Type...}
Typ := Parser.SkipTokenString;
{...Found Type "CDDATA".}
if UpperCase(Typ) = 'CDATA' then
Attribute.Typ := atData
else
{...Found Type "ID".}
if UpperCase(Typ) = 'ID' then
Attribute.Typ := atID
else
{...Found Type "IDREF".}
if UpperCase(Typ) = 'IDREF' then
Attribute.Typ := atIDRef
else
{...Found Type "enumeration".}
if Typ = '(' then
begin
Attribute.Typ := atEnumeration;
{Seperate enumeration parts and attach them}
{to the parent attribute.}
repeat
Parser.SkipToken;
if not(Parser.Token in ['|', ')']) then
Attribute.Enums.Add.Name := Parser.TokenString;
until Parser.Token in [toEOF, ')'];
{Add an error, if current token is "EOF".}
if Parser.Token = toEOF then
begin
AddErrorFmt('Invalid Enumeration End in Attribute "%s"', [Attribute.Name], Parser);
Exit;
end;
end
else
begin
AddErrorFmt('Invalid Attribute Typ "%s"', [Typ], Parser);
Exit;
end;
{Check for Restrictions...}
Parser.SkipToken;
if Parser.Token = '#' then
begin
{...Found Restriction "IMPLIED".}
Typ := UpperCase(Parser.SkipTokenString);
if Typ = 'IMPLIED' then
begin
Attribute.Status := asImplied;
Parser.SkipToken;
end;
{...Found Restriction "REQUIRED".}
if Typ = 'REQUIRED' then
begin
Attribute.Status := asRequired;
Parser.SkipToken;
end;
{...Found Restriction "FIXED".}
if Typ = 'FIXED' then
begin
Attribute.Status := asFixed;
Parser.SkipToken;
end;
end;
{Extract an optional default value.}
if Parser.Token = '"' then
begin
if Attribute.Status = asImplied then
Abort;
Attribute.Default := Trim(Parser.SkipToToken('"'));
Parser.SkipToken;
end;
until Parser.Token = '';
except
AddErrorFmt('Invalid Attribute Format "%s"', [Target], Parser);
end;
end;
Note: The above methods only detects simple dtd grammas. To parse
all possible tags and additional grammars you had to include a more
complex algorithm to do that - for our purposes (and this article) it's
enough. If you are not familiar with the dtd syntax, check out the site
W3Schools.
Okay, at this point we have finished our object-model and parser
implementation. All we need now is an example application which will take
use of this units. Our demo application will parse a dtd file, detects the
structure and creates a simple xml output with a given startup node.
Take a look at the following dtd:
name CDATA #REQUIRED
value CDATA #REQUIRED
Type (Error | Warning | Information) #REQUIRED
Our demo application will create the following xml output:
In this case, the startup node is BeratungsKontextResp which
will be used as the root node for all other nodes. Our example is
implemented as a console application as followed:
program dtd2xml;
{$APPTYPE CONSOLE}
uses
SysUtils,
DTD_Parser in 'DTD_Parser.pas',
DTD_Document in 'DTD_Document.pas',
StringParser in 'StringParser.pas',
PrivateParser in 'PrivateParser.pas';
var
FileName: string;
Switch_XMLRoot: string;
Switch_XMLData: Boolean;
Switch_RootLst: Boolean;
DTDDocument: TDTDDocument;
DTDParser: TDTDParser;
RootElement: TDTDElement;
i: Integer;
{-----------------------------------------------------------------------------
Procedure: FindCmdSwitch
Author: mh
Date: 23-Jan-2002
Arguments: const Switch: string; const Default: string = ''
Result: string
-----------------------------------------------------------------------------}
function FindCmdSwitch(const Switch: string; const Default: string = ''): string;
var
i: Integer;
begin
Result := '';
for i := 1 to ParamCount do
if UpperCase(Copy(ParamStr(i), 1, Length(Switch))) = UpperCase(Switch) then
begin
Result := Copy(ParamStr(i), Length(Switch) + 1, MAXINT);
Exit;
end;
if Result = '' then
Result := Default;
end;
{-----------------------------------------------------------------------------
Procedure: WriteXML
Author: mh
Date: 23-Jan-2002
Arguments: const AElement: TDTDElement; const AStatus: TDTDElementStatus; Indent: Integer = 0
Result: None
-----------------------------------------------------------------------------}
procedure WriteXML(const AElement: TDTDElement; const AStatus: TDTDElementStatus; Indent: Integer = 0);
var
i: Integer;
Spacer, Def: string;
begin
for i := 1 to Indent * 2 do
Spacer := Spacer + #32;
Write(Spacer + ' for i := 0 to AElement.Attributes.Count - 1 do
with AElement.Attributes[i] do
begin
Def := Default;
if (Switch_XMLData) and (Def = '') then
begin
if Typ = atEnumeration then
begin
if Enums.Count 0 then
Def := Enums[0].Name
else
Def := '???';
end
else
Def := Name;
end;
Write(Format(' %s="%s"', [Name, Def]));
end;
if AElement.Typ etContainer then
begin
Def := '';
if (Switch_XMLData) and (AElement.Typ etEmpty) then
Def := AElement.Name;
WriteLn(Format('%s', [Def, AElement.Name]));
end
else
WriteLn('');
for i := 0 to AElement.Childs.Count - 1 do
WriteXML(DTDDocument.Elements.Find(AElement.Childs[i].Name), AElement.Childs[i].Status, Indent + 1);
if AElement.Typ = etContainer then
WriteLn(Spacer + Format('', [AElement.Name]));
end;
{-----------------------------------------------------------------------------
Procedure: main
Author: mh
Date: 23-Jan-2002
Arguments: None
Result: None
-----------------------------------------------------------------------------}
begin
// display usage.
if (ParamCount = 0) or (FindCmdSwitch('-?', '?') '?') then
begin
WriteLn('');
WriteLn('dtd2xml (parser framework example) version 1.0');
WriteLn('(w)ritten 2002 by Marc Hoffmann. GNU License');
WriteLn('');
WriteLn('Usage: dtd2xml [options] [-?]');
WriteLn('');
WriteLn('Options:');
WriteLn('-xmlroot= XML root element (? = possible elements)');
WriteLn('-xmldata=yes|no Include XML Example data (default = yes)');
WriteLn('');
Exit;
end;
// exract filename.
FileName := ParamStr(1);
// append default extenstion,
if ExtractFileExt(FileName) = '' then
FileName := ChangeFileExt(FileName, '.dtd');
// file exists?
if not FileExists(FileName) then
begin
WriteLn(Format('Fatal: File not found ''%s''.', [FileName]));
Exit;
end;
// extract command-line switches.
Switch_RootLst := FindCmdSwitch('-xmlroot=') = '?';
Switch_XMLRoot := FindCmdSwitch('-xmlroot=');
Switch_XMLData := UpperCase(FindCmdSwitch('-xmldata=')) 'NO';
// create new dtd-document.
DTDDocument := TDTDDocument.Create;
try
// create new dtd-parser.
DTDParser := TDTDParser.Create;
try
// parse file.
DTDParser.Parse(FileName, DTDDocument);
// display possible errors.
if DTDParser.Errors.Count 0 then
begin
for i := 0 to DTDParser.Errors.Count - 1 do
with DTDParser.Errors[i] do
WriteLn(Format('Error in Line %d, Pos %d: %s...', [Line, Position, Message]));
Exit;
end;
// search rootelement.
RootElement := DTDDocument.Elements.Find(Switch_XMLRoot);
// display rootelements & assign possible object.
for i := 0 to DTDDocument.Elements.Count - 1 do
if DTDDocument.Elements[i].Typ = etContainer then
begin
if Switch_RootLst then
WriteLn(DTDDocument.Elements[i].Name)
else
if (Switch_XMLRoot = '') and ((RootElement = nil) or ((RootElement nil)
and (RootElement.Childs.Count RootElement := DTDDocument.Elements[i];
end;
// exit app if rootlist-switch was set.
if Switch_RootLst then
Exit;
// exit app if rootelement is NIL.
if RootElement = nil then
begin
WriteLn(Format('Fatal: Root Element ''%s'' not found.', [Switch_XMLRoot]));
Exit;
end;
// exit app if rootelement is invalid.
if RootElement.Typ etContainer then
begin
WriteLn(Format('Fatal: ''%s'' is not a valid Root Element.', [Switch_XMLRoot]));
Exit;
end;
// write xml output.
WriteLn(Format('' + #13 + '', [RootElement.Name, ExtractFileName(FileName)]));
WriteLn('');
WriteXML(RootElement, RootElement.Status);
// free dtd-parser.
finally
DTDParser.Free;
end;
// free dtd-document.
finally
DTDDocument.Free;
end;
end.
Thank you very much for you regard.
M. Hoffmann