BSOne.SFC/Tocsg.Lib/VCL/Other/EM.DomParser.pas

913 lines
31 KiB
Plaintext

{==============================================================================|
| Project : Delphi HTML/XHTML parser module | 1.1.2 |
|==============================================================================|
| Content: |
|==============================================================================|
| The contents of this file are subject to the Mozilla Public License Ver. 1.0 |
| (the "License"); you may not use this file except in compliance with the |
| License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ |
| |
| Software distributed under the License is distributed on an "AS IS" basis, |
| WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for |
| the specific language governing rights and limitations under the License. |
|==============================================================================|
| Initial Developers of the Original Code are: |
| Sandbil (Russia) sandbil@ya.ru |
| All Rights Reserved. |
| Last Modified: |
| 25.10.2014, Sandbil |
|==============================================================================|
| History: see README |
|==============================================================================|}
unit EM.DomParser;
interface
uses
System.Classes, System.RegularExpressionsCore, System.Generics.Collections,
System.Contnrs, System.StrUtils, System.SysUtils;
type
TNodeList = class;
TChildList=class;
TDomTreeNode = class;
TDomTree = class
private
FCount: Integer;
fParseErr: TStringList;
fRootNode: TDomTreeNode;
public
constructor Create;
destructor destroy; override;
property Count: Integer read fCount;
property RootNode: TDomTreeNode read fRootNode;
property ParseErr: TStringList read fParseErr;
end;
TDomTreeNode = class(TObject)
private
fTag: string;
fAttributesTxt: string;
fAttributes: TDictionary<string, string>;
fText: string;
fTypeTag: string;
fChild: TChildList;
fParent: Pointer;
fOwner: TDomTree;
public
property Tag: string read fTag;
property AttributesTxt: string read fAttributesTxt;
property Attributes: TDictionary<string, string> read fAttributes;
property Text: string read fText;
property TypeTag: string read fTypeTag;
property Child: TChildList read fChild;
property Parent: Pointer read fParent;
property Owner: TDomTree read fOwner;
constructor create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string; hAttr:
TDictionary<string, string>; hTypeTag, hText: string);
destructor destroy; override;
function FindNode(hNameTag: string; hIndex:integer; hAttrTxt: String;
hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
function FindTagOfIndex(hNameTag: String; hIndex:integer; hAnyLevel:
Boolean; dListNode: TNodeList): Boolean;
function FindXPath(hXPathTxt: String; dListNode: TNodeList;
dListValue:TStringList): Boolean;
function GetAttrValue(hAttrName:string): string;
function GetComment(hIndex: Integer): string;
function GetTagName: string;
function GetTextValue(hIndex:Integer): string;
function GetXPath(hRelative:boolean): string;
function RunParse(HtmlTxt: String): Boolean;
end;
TChildList = class(TList)
private
function Get(Index: Integer): TDomTreeNode;
public
destructor Destroy; override;
property Items[Index: Integer]: TDomTreeNode read Get; default;
end;
TNodeList = class(TList)
private
function Get(Index: Integer): TDomTreeNode;
public
property Items[Index: Integer]: TDomTreeNode read Get; default;
end;
PPrmRec=^TPrmRec;
TPrmRec = record
TagName: string;
ind: Integer;
Attr: string;
AnyLevel: Boolean;
end;
TPrmRecList = class(TList)
private
function Get(Index: Integer): PPrmRec;
public
destructor Destroy; override;
property Items[Index: Integer]: PPrmRec read Get; default;
end;
implementation
{ TDomTree }
{
*********************************** TDomTree ***********************************
}
constructor TDomTree.Create;
begin
fParseErr:= TStringList.Create;
fRootnode:= TDomTreeNode.Create(self,self,'Root','',nil,'','');
FCount:=0;
end;
destructor TDomTree.destroy;
begin
FreeAndNil(fParseErr);
FreeAndNil(fRootNode);
inherited;
end;
{ TChildList }
{
********************************** TChildList **********************************
}
destructor TChildList.Destroy;
var
i: Integer;
begin
for i := 0 to Count - 1 do
self[i].Free;
inherited;
end;
function TChildList.Get(Index: Integer): TDomTreeNode;
begin
Result := TDomTreeNode(inherited Get(Index));
end;
{ TNodeList }
function TNodeList.Get(Index: Integer): TDomTreeNode;
begin
Result := TDomTreeNode(inherited Get(Index));
end;
{ TPrmRecList }
{
********************************* TPrmRecList **********************************
}
destructor TPrmRecList.Destroy;
var
i: Integer;
begin
for i := 0 to Count - 1 do
FreeMem(Items[i]);
inherited;
end;
function TPrmRecList.Get(Index: Integer): PPrmRec;
begin
Result := PPrmRec(inherited Get(Index));
end;
{ TDomTreeNode }
{
********************************* TDomTreeNode *********************************
}
constructor TDomTreeNode.create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string;
hAttr: TDictionary<string, string>; hTypeTag, hText: string);
begin
fChild := TChildList.create;
fParent := hParent;
fTag := hTag;
fAttributesTxt := hAttrTxt;
fAttributes := hAttr;
fTypeTag:= hTypeTag;
fText := hText;
fOwner:=hOwner;
inc(hOwner.FCount);
end;
destructor TDomTreeNode.destroy;
begin
FreeAndNil(fAttributes);
FreeAndNil(fChild);
inherited;
end;
//***********FindAttr*************
// hNameTag - name Tag
// hIndex - number of a tag one after another (0 - all tag, 1 - each first ..)
// hAttrTxt - attribute. ex. alt=1
// hAnyLevel - true - all levels after start node; false - only one child level after start node
// dListNode - return TNodeList of TDomTreeNode
function TDomTreeNode.FindNode(hNameTag: string; hIndex:integer; hAttrTxt:
String; hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
var
RegEx: TPerlRegEx;
i,a: integer;
TagNodeList:TNodeList;
tValue: string;
Function FindAttrChildNode(aNode:TDomTreeNode;AttrName,AttrValue: String):TNodeList;
var
aValue: String;
j: integer;
begin
for j := 0 to aNode.Child.Count - 1 do
begin
if aNode.Child[j].Attributes <> nil then
if aNode.Child[j].Attributes.ContainsKey(AttrName) then
if aNode.Child[j].Attributes.TryGetValue(AttrName, aValue) then
if AttrValue = aValue then dListNode.Add(aNode.Child[j]);
if hAnyLevel then
FindAttrChildNode(aNode.Child[j], AttrName, AttrValue);
end;
result:=dListNode;
end;
begin
RegEx:=nil;
try
result:=false;
RegEx := TPerlRegEx.create;
RegEx.Subject := hAttrTxt;
RegEx.RegEx :='([^\s]*?[^\S]*)=([^\S]*".*?"[^\S]*)|'+
'([^\s]*?[^\S]*)=([^\S]*#39.*?#39[^\S]*)|'+
'([^\s]*?[^\S]*)=([^\S]*[^\s]+[^\S]*)|'+
'(autofocus[^\S]*)()|'+
'(disabled[^\S]*)()|'+
'(selected[^\S]*)()';
if (not (hAttrTxt = '')) and (RegEx.Match) then
begin
for i := 1 to RegEx.GroupCount do
if trim(RegEx.Groups[i]) <> '' then break;
if hNameTag = '' then
begin
if FindAttrChildNode(self,RegEx.Groups[i],RegEx.Groups[i+1]).Count>0
then result:=true;
end
else
begin
TagNodeList:=TNodeList.Create;
if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then
for a := 0 to TagNodeList.Count - 1 do
if TagNodeList[a].Attributes <> nil then
if TagNodeList[a].Attributes.ContainsKey(RegEx.Groups[i]) then
if TagNodeList[a].Attributes.TryGetValue(RegEx.Groups[i], tValue) then
//There was a strong compareson of values of attribute
// if RegEx.Groups = tValue)
if Pos(RegEx.Groups[i+1], tValue)>0
then
begin
dListNode.Add(TagNodeList[a]);
result:=true;
end;
TagNodeList.Free;
end;
end
else
if hAttrTxt = '' then
begin
TagNodeList:=TNodeList.Create;
if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then
for a := 0 to TagNodeList.Count - 1 do
begin
dListNode.Add(TagNodeList[a]);
result:=true;
end;
TagNodeList.Free;
end
else raise Exception.create('Attribute not found: '+ hAttrTxt );
finally
RegEx.free
end;
end;
//***********FindTagOfIndex*************
// hNameTag - name Tag (* - any tag, except text tag)
// hIndex - number of a tag one after another (0 - all tag, 1 - each first ..)
// hAnyLevel - true - all level after start node; false - only one child level after start node
// dListNode - return TNodeList of TDomTreeNode
function TDomTreeNode.FindTagOfIndex(hNameTag: String; hIndex:integer;
hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
function SubStringOccurences(const subString, sourceString : string; caseSensitive : boolean) : integer;
var
pEx: integer;
sub, source : string;
begin
if caseSensitive then
begin
sub := subString;
source := sourceString;
end
else
begin
sub := LowerCase(subString);
source := LowerCase(sourceString);
end;
result := 0;
pEx := PosEx(sub, source, 1);
while pEx <> 0 do
begin
Inc(result);
pEx := PosEx(sub, source, pEx + Length(sub));
end;
end;
Function FindChildTagOfIndex(aNode:TDomTreeNode):TNodeList;
var
countNode,j: integer;
enumTags:string;
begin
countNode:=0;
for j := 0 to aNode.Child.Count - 1 do
begin
if hNameTag <> '*' then
begin
if ((AnsiUpperCase(aNode.Child[j].Tag) = AnsiUpperCase(hNameTag)) and (aNode.Child[j].TypeTag <> '</%s>'))
or ((AnsiUpperCase(aNode.Child[j].Tag) = '') and (AnsiUpperCase(hNameTag)='TEXT()') and (aNode.Child[j].Text <> ''))
or ((LeftStr(AnsiUpperCase(aNode.Child[j].Tag),4) = '<!--') and (AnsiUpperCase(hNameTag)='COMMENT()'))
then
begin
Inc(countNode);
if (countNode = hIndex ) or (hIndex = 0) then dListNode.Add(aNode.Child[j])
end;
if (hAnyLevel) and (aNode.Child.Count > 0) then FindChildTagOfIndex(aNode.Child[j]) ;
end
else
begin
if (aNode.Child[j].TypeTag <> '</%s>') then
begin
enumTags:=enumTags + AnsiUpperCase(aNode.Child[j].Tag)+',';
if (SubStringOccurences(AnsiUpperCase(aNode.Child[j].Tag)+',',enumTags, false) = hIndex ) or (hIndex = 0) then dListNode.Add(aNode.Child[j])
end;
if (hAnyLevel) and (aNode.Child.Count > 0) then FindChildTagOfIndex(aNode.Child[j]) ;
end;
end;
result:=dListNode;
end;
begin
result:=false;
if FindChildTagOfIndex(self).Count > 0
then result:=true;
end;
function TDomTreeNode.FindXPath(hXPathTxt: String; dListNode: TNodeList;
dListValue:TStringList): Boolean;
var
RegExXPath, RegExXPathElmt: TPerlRegEx;
i: integer;
NextAnyLevel:boolean;
PrmXPath:TPrmRecList;
PrmXPathSTR: String;
PrmCount:integer;
procedure MatchXpath(Context,mTxtElmt:string) ;
var
Prm: PPrmRec;
begin
if (Context='/') and (trim(mTxtElmt)='') then NextAnyLevel:=true
else if (Context='/') and (trim(mTxtElmt)='..') then
begin
New(prm);
Prm.TagName:='..';
Prm.ind:=0;
Prm.Attr:='';
Prm.AnyLevel:=false;
PrmXPath.Add(Prm);
end
else
begin
RegExXPathElmt.Options := [preCaseLess];
RegExXPathElmt.Subject:=trim(mTxtElmt);
RegExXPathElmt.RegEx:='^([\.\*@A-Z][-A-Z0-9\(\)]*)\[?([0-9]*)\]?\[?@?([^\]]*)';
if RegExXPathElmt.Match then
begin
New(prm);
Prm.TagName:=RegExXPathElmt.Groups[1];
if not TryStrToInt( RegExXPathElmt.Groups[2], Prm.ind ) then Prm.ind:=0;
Prm.Attr:=RegExXPathElmt.Groups[3];
Prm.AnyLevel:=NextAnyLevel;
if (Context='/') then NextAnyLevel:=False;
PrmXPath.Add(Prm);
end
else
raise Exception.create('XPath is not correct '+ Context + mTxtElmt );
end;
end;
Function FindWithPrm(cPrm:integer; CurNode:TDomTreeNode; dListNode: TNodeList) : boolean;
var
i: integer;
cLNode: TNodeList;
begin
result:=false;
if PrmXPath[cPrm].TagName = '..' then
FindWithPrm(cPrm + 1,CurNode.Parent, dListNode)
else
begin
cLNode:=TNodeList.Create;
if CurNode.FindNode(PrmXPath[cPrm].TagName,PrmXPath[cPrm].ind,PrmXPath[cPrm].Attr,PrmXPath[cPrm].AnyLevel,cLNode) then
for I := 0 to cLNode.Count - 1 do
if cPrm < PrmCount then
FindWithPrm(cPrm + 1,cLNode[i], dListNode)
else dListNode.Add(cLNode[i]) ;
cLNode.free;
end;
if dListNode.Count > 0 then result:=true
end;
begin
PrmXPath:=nil;
RegExXPath:=nil;
RegExXPathElmt:=nil;
try
result:=false;
NextAnyLevel:=false;
PrmXPath:=TPrmRecList.Create;
PrmXPathSTR:='';
RegExXPath := TPerlRegEx.create;
RegExXPathElmt := TPerlRegEx.create;
RegExXPath.Subject:= hXPathTxt;
RegExXPath.RegEx:='(/)([\*@]?[^/]*)';
if RegExXPath.Match then
begin
MatchXpath(RegExXPath.Groups[1],RegExXPath.Groups[2]);
while RegExXPath.MatchAgain do
MatchXpath(RegExXPath.Groups[1],RegExXPath.Groups[2]);
for i := 0 to PrmXPath.Count-1 do
PrmXPathSTR:=PrmXPathSTR + PrmXPath[i].TagName +',' + inttostr(PrmXPath[i].ind) +',' + PrmXPath[i].Attr+',' + BoolToStr(PrmXPath[i].AnyLevel,True)+chr(13)+chr(10);
if PrmXPath.Count > 0 then
begin
if (PrmXPath[PrmXPath.Count-1].TagName[1]='@')
then
begin
PrmCount:= PrmXPath.Count - 2;
PrmXPath[PrmXPath.Count-1].TagName:=AnsiReplaceStr(PrmXPath[PrmXPath.Count-1].TagName,'@','');
if FindWithPrm(0,self,dListNode) then
begin
for I := 0 to dListNode.Count-1 do
if dListNode[i].GetAttrValue(PrmXPath[PrmXPath.Count-1].TagName)<>'' then
dListValue.Add(dListNode[i].GetAttrValue(PrmXPath[PrmXPath.Count-1].TagName));
if dListValue.Count > 0 then result:= true
else result:=false;
end
else result:=false;
end
else
begin
PrmCount:= PrmXPath.Count - 1;
result:= FindWithPrm(0,self,dListNode);
if (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='comment()')
or (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='text()') then
for I := 0 to dListNode.Count-1 do
begin
if (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='text()')
then dListValue.Add(dListNode[i].Text)
else dListValue.Add(TDomTreeNode(dListNode[i]).Tag) ;
end;
end;
end
else raise Exception.create('XPath is not correct or empty.');
end
else raise Exception.create('XPath is not correct or empty.');
finally
PrmXPath.Free;
RegExXPath.Free;
RegExXPathElmt.Free;
end;
end;
function TDomTreeNode.GetAttrValue(hAttrName:string): string;
begin
result:='';
if self.Attributes <> nil then
if self.Attributes.ContainsKey(hAttrName) then
if not self.Attributes.TryGetValue(hAttrName, result) then
result:='';
end;
function TDomTreeNode.GetComment(hIndex: Integer): string;
var
countNode,j: integer;
begin
result:='';
countNode:=0;
for j := 0 to self.Child.Count - 1 do
if (LeftStr(self.Child[j].Tag,4) = '<!--') and
(self.Child[j].TypeTag = '%s') and
(self.Child[j].Text = '')
then
begin
Inc(countNode);
if (countNode = hIndex ) or (hIndex = 0) then
begin
result:= self.Child[j].Tag;
break;
end;
end;
end;
function TDomTreeNode.GetTagName: string;
begin
if self.TypeTag='</%s>' then
result:= format(AnsiReplaceStr(self.TypeTag,'/',''),[self.Tag + ' ' + self.AttributesTxt] )
else
result:= format(self.TypeTag,[self.Tag + ' ' + self.AttributesTxt] );
end;
function TDomTreeNode.GetTextValue(hIndex:Integer): string;
var
countNode,j: integer;
begin
result:='';
countNode:=0;
for j := 0 to self.Child.Count - 1 do
if (self.Child[j].Tag = '') and
(self.Child[j].TypeTag = '') and
(self.Child[j].Text <> '')
then
begin
Inc(countNode);
if (countNode = hIndex ) or (hIndex = 0) then
begin
result:= self.Child[j].Text;
break;
end;
end;
end;
function TDomTreeNode.GetXPath(hRelative:boolean): string;
function GetCountTag(Node: TDomTreeNode): string;
var
CountNode, nNode, i: integer;
begin
CountNode:=0;
result:= '';
if TObject(Node.Parent) is TDomTreeNode then
begin
for i:=0 to TDomTreeNode(Node.Parent).Child.Count - 1 do
begin
if (Node.Tag = TDomTreeNode(Node.Parent).Child[i].Tag)
or ((LeftStr(Node.Tag,4)='<!--') and (LeftStr(TDomTreeNode(Node.Parent).Child[i].Tag,4)='<!--'))
then
inc(CountNode);
if Node = TDomTreeNode(Node.Parent).Child[i] then
nNode:= CountNode;
end;
if (CountNode <> nNode) or ((CountNode = nNode) and (CountNode > 1)) then
result:= format('[%d]',[nNode]);
end;
end;
function GetParent(Node: TDomTreeNode): string;
begin
if TObject(Node.Parent) is TDomTreeNode then
begin
if (hRelative) and (TDomTreeNode(Node.Parent).GetAttrValue('id') <>'') then
result:=format('//*[@id=%s]',[TDomTreeNode(Node.Parent).GetAttrValue('id')])+
'/' + result
else
result:=GetParent(Node.Parent)+
TDomTreeNode(Node.Parent).Tag + GetCountTag(Node.Parent) + '/' + result
end
else result:='.'+result;
end;
begin
if (LeftStr(self.Tag,2) <> '<?') and (LeftStr(self.Tag,9) <> '<!DOCTYPE') then
begin
if LeftStr(self.Tag,4) = '<!--' then result:='comment()'
else if self.Tag <> '' then result:=self.Tag
else result:='text()';
result:=GetParent(self) + result + GetCountTag(self);
if result[1]='.' then
result:='.'+RightStr(result, length(result) - Pos('/', result, 1)+1);
end
else result:='';
end;
function TDomTreeNode.RunParse(HtmlTxt: String): Boolean;
var
RegExHTML, RegExTag: TPerlRegEx;
prev, ErrParseHTML, ind: integer;
ChildTree: TDomTreeNode;
HtmlUtf8, RegExException: string;
tag_txt: TArray<String>;
function getAttr(mAttrTxt: string): TDictionary<string, string>;
var
sGroup,
CheckAttr: String;
procedure MatchAttr;
var
i: integer;
begin
CheckAttr := StuffString(CheckAttr,RegExTag.MatchedOffset+1, RegExTag.MatchedLength, StringOfChar(' ',RegExTag.MatchedLength));
for i := 1 to RegExTag.GroupCount do
if trim(RegExTag.Groups[i]) <> '' then
begin
try
sGroup := trim(RegExTag.Groups[i]);
if not result.ContainsKey(sGroup) then // 추가 16_0119 00:28:35 sunk
result.Add(sGroup, trim(RegExTag.Groups[i + 1]));
except
on E: Exception do
Owner.fParseErr.Add('Warning: not add Attributtes ' +
E.ClassName + ' : ' + E.Message + 'Sourse string: ' + mAttrTxt +
';' + chr(13)+chr(10)+' attributtes: ' + RegExTag.Groups[i]);
end;
break;
end;
end;
begin
try
result := TDictionary<string, string>.create;
if trim(mAttrTxt) <> '' then
begin
RegExTag.Subject := mAttrTxt;
CheckAttr := mAttrTxt;
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
RegExTag.Replacement:='';
// here RegExp for processing attributes of tags
// First not Empty - attribute, next - value
RegExTag.RegEx :='([^\s]*?[^\S]*)=([^\S]*".*?"[^\S]*)|'+
'([^\s]*?[^\S]*)=([^\S]*'#39'.*?'#39'[^\S]*)|'+
'([^\s]*?[^\S]*)=([^\S]*[^\s]+[^\S]*)|'+
'(allowTransparency[^\S]*)()|'+
'(allowfullscreen[^\S]*)()|'+
'(novalidate[^\S]*)()|'+
'(autofocus[^\S]*)()|'+
'(itemscope[^\S]*)()|'+
'(disabled[^\S]*)()|'+
'(readonly[^\S]*)()|'+
'(selected[^\S]*)()|'+
'(checked[^\S]*)()|'+
'(pubdate[^\S]*)()|'+
'(nowrap[^\S]*)()|'+
'(hidden[^\S]*)()|'+
'(async[^\S]*)()';
if RegExTag.Match then
begin
MatchAttr;
while RegExTag.MatchAgain do
MatchAttr;
// ***Start Check Parsing Tag Attributes Error****
if Length(Trim(CheckAttr)) > 0 then
Owner.fParseErr.Add('Warning: parsed not all attributes, ' +
'sourse string: ' + mAttrTxt + chr(13)+chr(10)+
'not parsed string: ' + Trim(CheckAttr));
// ***End Check Parsing Tag Attributes Error************
end
else
Owner.fParseErr.Add('Attributtes not found - ' +
'Sourse string: ' + mAttrTxt);
end;
except
on E: Exception do
Owner.fParseErr.Add('Attributtes - ' + E.ClassName + ' : ' +
E.Message + 'Sourse string: ' + mAttrTxt);
end;
end;
function getTagTxt(mTxt: string): TArray<String>;
begin
try
SetLength(result, 4);
result[0] := ''; // name tag
result[1] := ''; // text attributes
result[2] := ''; // text value following for tag
result[3] := ''; // type tag
if LeftStr(trim(mTxt),2) = '</' then result[3] :='</%s>' //close
else if RightStr(trim(mTxt),2) = '/>' then result[3] :='<%s/>' //selfclose
else if LeftStr(trim(mTxt),2) = '<!' then result[3] :='%s'
else if LeftStr(trim(mTxt),2) = '<?' then result[3] :='%s'
else result[3] :='<%s>'; // open
RegExTag.Subject := mTxt;
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
// here RegExp for processing HTML tags
// Group 1- tag, 2- attributes, 3- text
RegExTag.RegEx := '<([/A-Z][:A-Z0-9]*)\b([^>]*)>([^<]*)';
if RegExTag.Match then
begin
// ****************Start Check Parsing HTML Tag Error************
if mTxt <> '<' + RegExTag.Groups[1] + RegExTag.Groups[2] + '>' + RegExTag.Groups[3] then
Owner.fParseErr.Add('Check error Tags parsing - ' + 'Sourse string: ' + mTxt);
// ****************End Check Parsing HTML Tag Error************
result[0] := trim(RegExTag.Groups[1]);
if trim(RegExTag.Groups[2])<> '' then
if RightStr(trim(RegExTag.Groups[2]),1)= '/' then
result[1] := leftStr(trim(RegExTag.Groups[2]),length(trim(RegExTag.Groups[2]))-1)
else result[1] := trim(RegExTag.Groups[2]);
result[2] := trim(RegExTag.Groups[3]);
end
else
result[0] := trim(mTxt);
except
on E: Exception do
Owner.fParseErr.Add('Tags - ' + E.ClassName + ' : ' + E.Message +
'Sourse string: ' + mTxt);
end;
end;
function getPairTagTxt(mTxt, mPattern: string): TArray<String>;
begin
try
SetLength(result, 4);
result[0] := ''; // name tag
result[1] := ''; // text attributes
result[2] := ''; // text value following for tag
result[3] := ''; // close tag
RegExTag.Subject := mTxt;
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
// here RegExp for processing HTML tags
// Group 1- tag, 2- attributes, 3- text
RegExTag.RegEx := mPattern;
if RegExTag.Match then
begin
// ****************Start Check Parsing HTML Tag Error************
if trim(mTxt) <> '<' + RegExTag.Groups[1] + RegExTag.Groups[2] + '>' + RegExTag.Groups[3] + '<' +RegExTag.Groups[4] +'>' then
Owner.fParseErr.Add('Check error Exception Tags parsing - ' + 'Sourse string: ' + mTxt);
// ****************End Check Parsing HTML Tag Error************
result[0] := trim(RegExTag.Groups[1]);
result[1] := trim(RegExTag.Groups[2]);
result[2] := trim(RegExTag.Groups[3]);
result[3] := trim(RegExTag.Groups[4]);
end
else
result[0] := mTxt;
except
on E: Exception do
Owner.fParseErr.Add('Exception Tags - ' + E.ClassName + ' : ' + E.Message +
'Sourse string: ' + mTxt);
end;
end;
Function CheckParent(aChildTree: TDomTreeNode; tTag: string):TDomTreeNode;
var
ParentTag: string;
begin
result := aChildTree.Parent;
if tTag = '<%s>' then
result := aChildTree
else if tTag = '</%s>' then
if TObject(TDomTreeNode(aChildTree.Parent).Parent) is TDomTreeNode then
begin
ParentTag := TDomTreeNode(aChildTree.Parent).Tag;
if ParentTag = RightStr(aChildTree.Tag, length(aChildTree.Tag) - 1) then
result := TDomTreeNode(aChildTree.Parent).Parent
end;
end;
procedure MatchTag(mTxtMatch:string);
var
ExceptTag: string;
begin
// tag without close tag
ExceptTag :=
',META,LINK,IMG,COL,AREA,BASE,BASEFONT,ISINDEX,BGSOUNDCOMMAND,PARAM,INPUT,EMBED,FRAME,BR,WBR,HR,TRACK,';
if (leftstr(mTxtMatch, 4) = '<!--') then
begin
tag_txt[0] := trim(mTxtMatch);
tag_txt[1] := '';
tag_txt[2] := '';
tag_txt[3] := '%s';
ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], '', nil, '%s','')) ;
end
else if (AnsiUpperCase(leftstr(mTxtMatch, 7)) = '<TITLE>') // tag with any symbol
or (AnsiUpperCase(leftstr(mTxtMatch, 10)) = '<PLAINTEXT>')
or (AnsiUpperCase(leftstr(mTxtMatch, 5)) = '<XMP>')
or (AnsiUpperCase(leftstr(mTxtMatch, 7)) = '<SCRIPT')
or (AnsiUpperCase(leftstr(mTxtMatch, 9)) = '<TEXTAREA')
//or (AnsiUpperCase(leftstr(mTxtMatch, 4)) = '<PRE')
then
begin
tag_txt := getPairTagTxt(mTxtMatch,'<([A-Z][A-Z0-9]*)\b([^>]*?)>(.*)<(/\1)>');
ind:=ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], tag_txt[1], getAttr(tag_txt[1]), '<%s>','')) ;
if tag_txt[2] <> '' then ChildTree.Child[ind].Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree.Child[ind], '', '', nil, '', tag_txt[2]));
ChildTree.Child[ind].Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree.Child[ind], tag_txt[3], '', nil, '</%s>','')) ;
end
else
begin
tag_txt := getTagTxt(mTxtMatch);
ind := ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], tag_txt[1], getAttr(tag_txt[1]), tag_txt[3],''));
if (Pos(',' + AnsiUpperCase(trim(tag_txt[0])) + ',', ExceptTag) = 0)
and (LeftStr(tag_txt[0],2) <> '<?')
and (LeftStr(tag_txt[0],2) <> '<!') then
ChildTree := CheckParent(ChildTree.Child[ind],tag_txt[3]);
if tag_txt[2] <> '' then
ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, '', '', nil, '',tag_txt[2]));
end;
end;
// *************************** START PARSE HTML*************************
begin
RegExHTML:=nil;
RegExTag:=nil;
try
HtmlUtf8 := HtmlTxt;
RegExHTML := TPerlRegEx.create;
RegExTag := TPerlRegEx.create;
ErrParseHTML:=0;
RegExHTML.Options := [preCaseLess, preMultiLine, preSingleLine];
ChildTree := self;
with RegExHTML do
begin
// *********RegExp for parsing HTML**************
// (<title>.*</title>[^<]*) - title
// (<\!--.+?-->[^<]*) - comment
// (<script.*?</script>[^<]*) - script
// (<[^>]+>[^<]*) - all remaining tags
// [^<]* - text
RegExException :='(<PLAINTEXT>.*?</PLAINTEXT>[^<]*)|'+
'(<title>.*?</title>[^<]*)|'+
'(<xmp>.*?</xmp>[^<]*)|'+
'(<script.*?</script>[^<]*)|'+
'(<textarea.*?</textarea>[^<]*)|'+
// '(<pre.*?</pre>[^<]*)|'+
'(<!--.+?-->[^<]*)|';
RegEx := RegExException + '(<[^>]+>[^<]*)'; // all teg and text
Subject := HtmlUtf8;
if Match then
begin
MatchTag(RegExHTML.MatchedText);
prev := MatchedOffset + MatchedLength;
while MatchAgain do
begin
MatchTag(RegExHTML.MatchedText);
// *****Start Check Parsing HTML Error************
if MatchedOffset - prev > 0 then
begin
Owner.fParseErr.Add(IntToStr(ErrParseHTML) + '- Check error found after HTML parsing');
inc(ErrParseHTML)
end;
prev := MatchedOffset + MatchedLength;
// *****End Check Parsing HTML Error************
end;
// ***********End RegExp match cycle************
end
else
raise Exception.create('Input text not contain HTML tags');
// *************End RegExp match ************
end;
Finally
RegExHTML.Free;
RegExTag.Free;
if Owner.FCount>0 then
result := True
else result := False ;
end;
end;
end.