//------------------------------------------------------------------------------
// RegExpUnit.Pas Copyright (C) 1997 Object Dynamics Ltd.
//
// An implementation of regular expression searching for Delphi 2 and later. The
// code is based on that presented by Kernighan & Plauger in their book
// "Software Tools in Pascal" (ISBN 0-201-10342-7). You should consult this book
// for a detailed explanation of how this implementation works, as the code is
// not particularly heavily commented!
//
// *** IMPORTANT ***
//
// By using this code, you accept the following conditions:
//
// You may use and adapt this code freely, but it remains the
// copyright of Object Dynamics Ltd. Any adaptations must retain the
// copyright message at the head of this file.
//
// You use this code at your own risk. Object Dynamics is not responsible
// for any loss or damage caused by programs using this code.
//
//
// History:
//
// Version 1.0 Created by Neil Butterworth, September 1997
// - Fixed problem with .* not matching entire line
//
//------------------------------------------------------------------------------
unit ODRegExpUnit;
interface
//------------------------------------------------------------------------------
// Find
//
// Searches the string "str" from position "start" for the pattern "pat". The
// pattern must have been constructed using one of the two MakePattern functions
// described below.
//
// The function returns the position of the pattern in the string, or zero if
// no match is found. If there is a match, the length of matched characters
// is returned in the "len" parameter.
//
// If the "csense" parameter is false, the case of characters is ignored.
//------------------------------------------------------------------------------
function Find( const str : string; start : integer;
const pat : string; var len : integer;
csense : boolean ) : integer;
//------------------------------------------------------------------------------
// MakePattern
//
// Constructs an encoded version of a regular expression, required for input
// as the "pat" parameter of Find, described above.
//
// MakePattern begins construction of the pattern at the position indicated
// by "start" - this should almost always be 1. This extraneous parameter is
// maintained for future Software Tools compatibility.
//------------------------------------------------------------------------------
function MakePattern( const pat : string; start : integer ) : string;
//------------------------------------------------------------------------------
// MakePatternNoRegexp
//
// As above, but treats all regular expression characters as non-special. For
// example:
//
// MakePattern( '[a-z]') wo
//
// would create apattern that would match the string '[a-z]', rather than
// one that would match a single lower-case character.
//------------------------------------------------------------------------------
function MakePatternNoRegEx( const pat : string; start : integer ) : string;
//------------------------------------------------------------------------------
implementation
uses
SysUtils;
//------------------------------------------------------------------------------
// The following constants define the symbols used by regular expressions. The
// set used is identical to that used by UNIX programs such as vi and ed, but
// does not include extended regular expressions as used by (for example )nawk.
//------------------------------------------------------------------------------
const
CLOSURE = '*'; // match zero or more of preceding character
BOL = '^'; // beginning of line
EOL = '$'; // end of line
ESCAPE = '\'; // escape next character
DASH = '-'; // used in [a-z] type expressions
NEGATE = '^'; // negate next character/range in [a-z] expression
CCL ='['; // intro for [a-z] expressions
CCLEND = ']'; // outro for [a-z] expressions
ANY = '.'; // match any single character
//------------------------------------------------------------------------------
// These are used for internally encoding expressions
//------------------------------------------------------------------------------
NCCL = '!'; // negate [a-z] must not be same as NEGATE!!!
LITCHAR = '@'; // quote single literal character
TAB = char( 9 ); // tab
//------------------------------------------------------------------------------
// Convert a single character to uppercase (if possible)
//------------------------------------------------------------------------------
function ToUpper( c : char ) : char;
begin
if (( c >= 'a' ) and (c <='z')) then
result := char(integer(c) - 32)
else
result := c;
end;
//------------------------------------------------------------------------------
// Compare two characters for equality. If csense is false, the comparison
// ignores case.
//------------------------------------------------------------------------------
function CmpChar( c1, c2 : char; csense : boolean ) : boolean;
begin
if ( csense ) then
result := c1 = c2
else
result := ToUpper(c1) = ToUpper(c2);
end;
//------------------------------------------------------------------------------
// Check if single character is alphanumeric.
//------------------------------------------------------------------------------
function IsAlphaNum( c : char ) : boolean;
begin
result := ((c>= 'A') and (c<='Z'))
or ((c >='a') and (c<='z'))
or ((c>='0') and (c<='9'));
end;
//------------------------------------------------------------------------------
// Check if index is beyond the last character position in a string.
//------------------------------------------------------------------------------
function AtEnd( const str : string; index : integer ) : boolean;
begin
result := index > Length( str );
end;
//------------------------------------------------------------------------------
// Expand an escaped character.
//------------------------------------------------------------------------------
function Esc( const s : string; var i : integer ) : char;
begin
if ( s[i] <> ESCAPE ) then
result := s[i]
else if ( AtEnd( s, i+1 )) then
result := ESCAPE
else begin
inc( i );
if ( s[i] = 't' ) then
result := TAB
else
result := s[i];
end;
end;
//------------------------------------------------------------------------------
// Expand a character class in the form c1-c2. For example a-d expands to "abcd".
//------------------------------------------------------------------------------
function ExpandDash( delim : char; const pat : string; var i : integer ) : string;
var
k : char;
begin
result := '';
while( (pat[i] <> delim) and (not AtEnd( pat, i ))) do begin
if ( pat[i] = ESCAPE ) then
result := result + Esc( pat, i )
else if ( pat[i] <> DASH ) then
result := result + pat[i]
else if ( AtEnd( pat, i )) then
result := result + DASH
else if ( IsAlphaNum( pat[i-1] )
and IsAlphaNum( pat[i+1])
and ( pat[i-1] <= pat[i+1])) then begin
for k := char(integer(pat[i-1]) + 1) to pat[i+1] do
result := result + k;
inc( i );
end
else
result := result + DASH;
inc( i );
end;
end;
//------------------------------------------------------------------------------
// Expand character class in form [a-z]
//------------------------------------------------------------------------------
function ExpandCharClass( const c : string; var i : integer ) : string;
var
countpos : integer;
tmp : string;
begin
result := '';
inc( i );
if ( c[i] = NEGATE ) then begin
result := result + NCCL;
inc ( i );
end
else
result := result + CCL;
result := result + ' ';
countpos := Length( result );
tmp := ExpandDash( CCLEND, c, i );
result[countpos] := char(length(tmp));
if ( c[i] = CCLEND ) then
result := result + tmp
else
result := '';
end;
//------------------------------------------------------------------------------
// Insert a closure symbol at position cpos.
//------------------------------------------------------------------------------
procedure InsertClosure( var pat : string; cpos : integer );
begin
Insert( CLOSURE, pat, cpos );
end;
//------------------------------------------------------------------------------
// Construct a pattern from an expression. A pattern is an expanded encoding
// of an expression, which the search functions need. This version ignores
// ALL regular expression characters.
//
// The "start" parameter indicates the starting point in the expression. This
// will almost always be 1.
//------------------------------------------------------------------------------
function MakePatternNoRegEx( const pat : string; start : integer ) : string;
var
i : integer;
begin
result := '';
for i := start to length( pat ) do
result := result + LITCHAR + pat[i];
end;
//------------------------------------------------------------------------------
// As above, but handles regualr expression characters.
//------------------------------------------------------------------------------
function MakePattern( const pat : string; start : integer ) : string;
var
p, pstart, i : integer;
begin
i := start;
result := '';
pstart := 0;
while( not AtEnd( pat, i ) ) do begin
if ( pat[i] = ANY ) then begin
pstart := Length( result ) + 1;
result := result + ANY;
end
else if ( (pat[i] = BOL) and (i = start )) then begin
pstart := Length( result ) + 1;
result := result + BOL;
end
else if ( (pat[i] =EOL) and AtEnd(pat, i+1 )) then begin
pstart := length( result ) + 1;
result := result + EOL;
end
else if ( pat[i] = CCL ) then begin
pstart := length( result ) + 1;
result := result + ExpandCharClass( pat, i );
end
else if ( ( pat[i] = CLOSURE ) and ( i > start )) then begin
p := pstart;
pstart := length( result ) + 1;
if ( ( p < 1 ) or (result[p] in [BOL, EOL, CLOSURE]) ) then begin
result := '';
exit;
end;
InsertClosure( result, p );
end
else begin
pstart := length( result ) + 1;
result := result + LITCHAR + Esc( pat, i );
end;
inc( i );
end;
end;
//------------------------------------------------------------------------------
// Get length of piece of a pattern. For example, the encoded length of the
// expression 'a' is 2, as it is encodeds a LITCHAR followed by an 'a'
//------------------------------------------------------------------------------
function PatSize( const pat : string; n : integer ) : integer;
begin
if ( pat[n] = LITCHAR ) then
result := 2
else if ( pat[n] in [BOL, EOL, ANY, CLOSURE] ) then
result := 1
else if ( (pat[n] = CCL) or (pat[n] = NCCL)) then
result := integer( pat[n+1] ) + 2;
end;
//------------------------------------------------------------------------------
// Find single character in character class.
//------------------------------------------------------------------------------
function LocateChar( c : char; const pat : string; offset : integer;
csense : boolean ) : boolean;
var
i : integer;
begin
result := false;
i := offset + integer( pat[offset] );
while( i > offset ) do begin
if ( CmpChar(c, pat[i], csense ) ) then begin
result := true;
exit;
end;
dec(i );
end;
end;
//------------------------------------------------------------------------------
// Match a single pattern element against a string.
//------------------------------------------------------------------------------
function MatchOne( const str : string; var i : integer;
const pat : string; j : integer;
csense : boolean ) : boolean;
var
advance : integer;
begin
advance := -1;
if ( AtEnd( str, i ) ) then begin
if ( pat[j] = EOL ) then
advance := 0;
end
else if ( not (pat[j] in [LITCHAR, BOl, EOL, ANY, CCL, NCCL, CLOSURE])) then
raise Exception.Create( 'should never happen!' )
else begin
case pat[j] of
LITCHAR:
if ( CmpChar( str[i], pat[j+1], csense ) ) then
advance := 1;
BOL:
if ( i = 1 ) then
advance := 0;
ANY:
if ( not AtEnd( str, i ) ) then //i+1
advance := 1;
EOL:
if ( AtEnd( str, i + 1 ) ) then
advance := 0;
CCL:
if ( LocateChar( str[i], pat, j+1, csense )) then
advance := 1;
NCCL:
if ( not AtEnd( str, i + 1 )
and( not LocateChar( str[i], pat, j+1, csense) )) then
advance := 1;
end;
end;
if ( advance >= 0 ) then begin
i := i + advance;
result := true;
end
else
result := false;
end;
//------------------------------------------------------------------------------
// Look for a match of patttern element pat[j] in a string, starting at offset.
//------------------------------------------------------------------------------
function MatchPat( const str : string; offset : integer;
const pat : string; j : integer;
csense : boolean ) : integer;
var
i, k : integer;
begin
while( not AtEnd( pat, j ) ) do begin
if ( pat[j] = CLOSURE ) then begin
j := j + PatSize( pat, j );
i := offset;
while( (not AtEnd( str, i ))
and ( MatchOne( str, i, pat, j, csense ))) do begin
// nothing
end;
while( i >= offset ) do begin
k := MatchPat( str, i, pat, j + PatSize( pat, j ), csense);
if ( k > 0 ) then
break;
dec( i );
end;
offset := k;
break;
end
else if ( not MatchOne( str, offset, pat, j, csense ) ) then begin
offset := 0;
break;
end
else
j := j + PatSize( pat, j );
end;
result := offset;
end;
//------------------------------------------------------------------------------
// Look for a pattern in a string, returning the position of the pattern
// (or zero if not found, and the length.
//------------------------------------------------------------------------------
function Find( const str : string; start : integer;
const pat : string;
var len : integer;
csense : boolean ) : integer;
var
i, pos : integer;
begin
i := start;
pos := 0;
result := 0;
while ( not AtEnd( str, i )) do begin
len := MatchPat( str, i, pat, 1, csense );
if ( len <> 0 ) then begin
len := len - i;
result := i;
exit;
end;
inc( i );
end;
end;
//----------------------------------- eof --------------------------------------
end.