The following code demonstrates how to parse a html file looking for
Begin Tag
End Tag
Raw Text
The following routine demonstrates how to parse a html file.
I welcome feed back to improve the routine, if you have any suggestions/hints please let me know.
rgds
Si Carter
---------------- BEGIN CODE BLOCK ------------------------
unit HTMLParse;
(***************************************************************************
HTMLParse
Purpose: Parse a html file to extract tags and plain text.
Copyright © 2003 - TECT Software Ltd. All Rights Reserved.
All code remains the property of TECT Software Ltd and may not
be changed without permission. Use of this code is granted to
any developer for private, open source or commercial applications.
No warranty expressed or implied. Use at own risk.
Contact:
WEB - www.tectsoft.com
EMail - support@tectsoft.com
Copyright Notice Must Remain With File.
Visit www.tectsoft.com for *low cost* developer friendly web hosting.
Requires:
FastStrings from http://www.droopyeyes.com
Usage:
See www.howtodothings.com for demo usage.
****************************************************************************)
interface
uses Classes, FastStringFuncs, FastStrings;
type
TTagType = (ttBeginTag, ttEndTag, ttRawText);
THTMLParseProc = procedure(const HTMLData: string; TagType: TTagType;
Parameters: TStrings);
procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);
implementation
uses SysUtils;
const
(* NOTE: download the file below, the following codes are wrong when
displayed in a browser like this :-) *)
THTMLReplaceWords: array[0..4] of array[0..1] of string = ((' ', ' '),
('&', '&'), ('<', '<'), ('>', '>'), ('"', '"'));
procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);
procedure CallTagProc(IsTag: Boolean; HTMLData: string);
var
s: string;
sl: TStringList;
I: Integer;
begin
HTMLData := Trim(HTMLData);
if Length(HTMLData) > 0 then
begin
if IsTag then
begin
if Pos(' ', HTMLData) > 0 then
s := Trim(Copy(HTMLData, 1, Pos(' ', HTMLData)))
else
s := Trim(HTMLData);
sl := TStringList.Create;
try
sl.Text := Trim(Copy(HTMLData, Length(s) + 1, length(HTMLData)));
sl.Text := Trim(FastReplace(sl.Text, ';', #13));
sl.Text := Trim(FastReplace(sl.Text, '" ', #13));
sl.Text := Trim(FastReplace(sl.Text, '"', ''));
if LeftStr(s, 1) = '/' then
THTMLParseProc(ParseProc)(uppercase(s), ttEndTag, sl)
else
THTMLParseProc(ParseProc)(UpperCase(s), ttBeginTag, sl);
finally
sl.Free;
end;
end else
begin
for I := 0 to 4 do
HTMLData := FastReplace(HTMLData, THTMLReplaceWords[I, 0],
THTMLReplaceWords[I, 1]);
THTMLParseProc(ParseProc)(HTMLData, ttRawText, nil);
end;
end;
end;
var
s: string;
P: PChar;
begin
Assert(Assigned(ParseProc));
P := PChar(HTML);
s := '';
while P^ <> #0 do
begin
case P^ of
'<':
begin
CallTagProc(False, s);
s := '';
end;
'>':
begin
CallTagProc(True, s);
s := '';
end;
else
s := s + P^;
end; //case
Inc(P);
end;
end;
end.