913 lines
31 KiB
Plaintext
913 lines
31 KiB
Plaintext
{==============================================================================|
|
|
| Project : Delphi HTML/XHTML parser module | 1.1.2 |
|
|
|==============================================================================|
|
|
| Content: |
|
|
|==============================================================================|
|
|
| The contents of this file are subject to the Mozilla Public License Ver. 1.0 |
|
|
| (the "License"); you may not use this file except in compliance with the |
|
|
| License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ |
|
|
| |
|
|
| Software distributed under the License is distributed on an "AS IS" basis, |
|
|
| WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for |
|
|
| the specific language governing rights and limitations under the License. |
|
|
|==============================================================================|
|
|
| Initial Developers of the Original Code are: |
|
|
| Sandbil (Russia) sandbil@ya.ru |
|
|
| All Rights Reserved. |
|
|
| Last Modified: |
|
|
| 25.10.2014, Sandbil |
|
|
|==============================================================================|
|
|
| History: see README |
|
|
|==============================================================================|}
|
|
|
|
|
|
unit EM.DomParser;
|
|
|
|
interface
|
|
|
|
uses
|
|
System.Classes, System.RegularExpressionsCore, System.Generics.Collections,
|
|
System.Contnrs, System.StrUtils, System.SysUtils;
|
|
|
|
|
|
|
|
type
|
|
TNodeList = class;
|
|
TChildList=class;
|
|
TDomTreeNode = class;
|
|
|
|
|
|
|
|
TDomTree = class
|
|
private
|
|
FCount: Integer;
|
|
fParseErr: TStringList;
|
|
fRootNode: TDomTreeNode;
|
|
public
|
|
constructor Create;
|
|
destructor destroy; override;
|
|
property Count: Integer read fCount;
|
|
property RootNode: TDomTreeNode read fRootNode;
|
|
property ParseErr: TStringList read fParseErr;
|
|
end;
|
|
|
|
TDomTreeNode = class(TObject)
|
|
private
|
|
fTag: string;
|
|
fAttributesTxt: string;
|
|
fAttributes: TDictionary<string, string>;
|
|
fText: string;
|
|
fTypeTag: string;
|
|
fChild: TChildList;
|
|
fParent: Pointer;
|
|
fOwner: TDomTree;
|
|
public
|
|
property Tag: string read fTag;
|
|
property AttributesTxt: string read fAttributesTxt;
|
|
property Attributes: TDictionary<string, string> read fAttributes;
|
|
property Text: string read fText;
|
|
property TypeTag: string read fTypeTag;
|
|
property Child: TChildList read fChild;
|
|
property Parent: Pointer read fParent;
|
|
property Owner: TDomTree read fOwner;
|
|
|
|
constructor create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string; hAttr:
|
|
TDictionary<string, string>; hTypeTag, hText: string);
|
|
destructor destroy; override;
|
|
function FindNode(hNameTag: string; hIndex:integer; hAttrTxt: String;
|
|
hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
|
|
function FindTagOfIndex(hNameTag: String; hIndex:integer; hAnyLevel:
|
|
Boolean; dListNode: TNodeList): Boolean;
|
|
function FindXPath(hXPathTxt: String; dListNode: TNodeList;
|
|
dListValue:TStringList): Boolean;
|
|
function GetAttrValue(hAttrName:string): string;
|
|
function GetComment(hIndex: Integer): string;
|
|
function GetTagName: string;
|
|
function GetTextValue(hIndex:Integer): string;
|
|
function GetXPath(hRelative:boolean): string;
|
|
function RunParse(HtmlTxt: String): Boolean;
|
|
end;
|
|
|
|
TChildList = class(TList)
|
|
private
|
|
function Get(Index: Integer): TDomTreeNode;
|
|
public
|
|
destructor Destroy; override;
|
|
property Items[Index: Integer]: TDomTreeNode read Get; default;
|
|
end;
|
|
|
|
TNodeList = class(TList)
|
|
private
|
|
function Get(Index: Integer): TDomTreeNode;
|
|
public
|
|
property Items[Index: Integer]: TDomTreeNode read Get; default;
|
|
end;
|
|
|
|
|
|
PPrmRec=^TPrmRec;
|
|
TPrmRec = record
|
|
TagName: string;
|
|
ind: Integer;
|
|
Attr: string;
|
|
AnyLevel: Boolean;
|
|
end;
|
|
|
|
TPrmRecList = class(TList)
|
|
private
|
|
function Get(Index: Integer): PPrmRec;
|
|
public
|
|
destructor Destroy; override;
|
|
property Items[Index: Integer]: PPrmRec read Get; default;
|
|
end;
|
|
|
|
|
|
|
|
|
|
|
|
implementation
|
|
|
|
{ TDomTree }
|
|
|
|
{
|
|
*********************************** TDomTree ***********************************
|
|
}
|
|
constructor TDomTree.Create;
|
|
begin
|
|
fParseErr:= TStringList.Create;
|
|
fRootnode:= TDomTreeNode.Create(self,self,'Root','',nil,'','');
|
|
FCount:=0;
|
|
|
|
end;
|
|
|
|
destructor TDomTree.destroy;
|
|
begin
|
|
FreeAndNil(fParseErr);
|
|
FreeAndNil(fRootNode);
|
|
inherited;
|
|
end;
|
|
|
|
|
|
{ TChildList }
|
|
|
|
{
|
|
********************************** TChildList **********************************
|
|
}
|
|
destructor TChildList.Destroy;
|
|
var
|
|
i: Integer;
|
|
begin
|
|
for i := 0 to Count - 1 do
|
|
self[i].Free;
|
|
inherited;
|
|
end;
|
|
|
|
|
|
function TChildList.Get(Index: Integer): TDomTreeNode;
|
|
begin
|
|
Result := TDomTreeNode(inherited Get(Index));
|
|
end;
|
|
|
|
{ TNodeList }
|
|
|
|
function TNodeList.Get(Index: Integer): TDomTreeNode;
|
|
begin
|
|
Result := TDomTreeNode(inherited Get(Index));
|
|
end;
|
|
|
|
|
|
{ TPrmRecList }
|
|
|
|
{
|
|
********************************* TPrmRecList **********************************
|
|
}
|
|
destructor TPrmRecList.Destroy;
|
|
var
|
|
i: Integer;
|
|
begin
|
|
for i := 0 to Count - 1 do
|
|
FreeMem(Items[i]);
|
|
inherited;
|
|
end;
|
|
|
|
|
|
|
|
function TPrmRecList.Get(Index: Integer): PPrmRec;
|
|
begin
|
|
Result := PPrmRec(inherited Get(Index));
|
|
end;
|
|
|
|
{ TDomTreeNode }
|
|
|
|
{
|
|
********************************* TDomTreeNode *********************************
|
|
}
|
|
constructor TDomTreeNode.create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string;
|
|
hAttr: TDictionary<string, string>; hTypeTag, hText: string);
|
|
begin
|
|
fChild := TChildList.create;
|
|
fParent := hParent;
|
|
fTag := hTag;
|
|
fAttributesTxt := hAttrTxt;
|
|
fAttributes := hAttr;
|
|
fTypeTag:= hTypeTag;
|
|
fText := hText;
|
|
fOwner:=hOwner;
|
|
inc(hOwner.FCount);
|
|
end;
|
|
|
|
destructor TDomTreeNode.destroy;
|
|
begin
|
|
FreeAndNil(fAttributes);
|
|
FreeAndNil(fChild);
|
|
inherited;
|
|
end;
|
|
|
|
//***********FindAttr*************
|
|
// hNameTag - name Tag
|
|
// hIndex - number of a tag one after another (0 - all tag, 1 - each first ..)
|
|
// hAttrTxt - attribute. ex. alt=1
|
|
// hAnyLevel - true - all levels after start node; false - only one child level after start node
|
|
// dListNode - return TNodeList of TDomTreeNode
|
|
|
|
function TDomTreeNode.FindNode(hNameTag: string; hIndex:integer; hAttrTxt:
|
|
String; hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
|
|
var
|
|
RegEx: TPerlRegEx;
|
|
i,a: integer;
|
|
TagNodeList:TNodeList;
|
|
tValue: string;
|
|
|
|
Function FindAttrChildNode(aNode:TDomTreeNode;AttrName,AttrValue: String):TNodeList;
|
|
var
|
|
aValue: String;
|
|
j: integer;
|
|
begin
|
|
for j := 0 to aNode.Child.Count - 1 do
|
|
begin
|
|
if aNode.Child[j].Attributes <> nil then
|
|
if aNode.Child[j].Attributes.ContainsKey(AttrName) then
|
|
if aNode.Child[j].Attributes.TryGetValue(AttrName, aValue) then
|
|
if AttrValue = aValue then dListNode.Add(aNode.Child[j]);
|
|
if hAnyLevel then
|
|
FindAttrChildNode(aNode.Child[j], AttrName, AttrValue);
|
|
end;
|
|
result:=dListNode;
|
|
end;
|
|
|
|
begin
|
|
RegEx:=nil;
|
|
try
|
|
result:=false;
|
|
RegEx := TPerlRegEx.create;
|
|
RegEx.Subject := hAttrTxt;
|
|
RegEx.RegEx :='([^\s]*?[^\S]*)=([^\S]*".*?"[^\S]*)|'+
|
|
'([^\s]*?[^\S]*)=([^\S]*#39.*?#39[^\S]*)|'+
|
|
'([^\s]*?[^\S]*)=([^\S]*[^\s]+[^\S]*)|'+
|
|
'(autofocus[^\S]*)()|'+
|
|
'(disabled[^\S]*)()|'+
|
|
'(selected[^\S]*)()';
|
|
|
|
if (not (hAttrTxt = '')) and (RegEx.Match) then
|
|
begin
|
|
for i := 1 to RegEx.GroupCount do
|
|
if trim(RegEx.Groups[i]) <> '' then break;
|
|
if hNameTag = '' then
|
|
begin
|
|
if FindAttrChildNode(self,RegEx.Groups[i],RegEx.Groups[i+1]).Count>0
|
|
then result:=true;
|
|
end
|
|
else
|
|
begin
|
|
TagNodeList:=TNodeList.Create;
|
|
if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then
|
|
for a := 0 to TagNodeList.Count - 1 do
|
|
if TagNodeList[a].Attributes <> nil then
|
|
if TagNodeList[a].Attributes.ContainsKey(RegEx.Groups[i]) then
|
|
if TagNodeList[a].Attributes.TryGetValue(RegEx.Groups[i], tValue) then
|
|
//There was a strong compareson of values of attribute
|
|
// if RegEx.Groups = tValue)
|
|
if Pos(RegEx.Groups[i+1], tValue)>0
|
|
then
|
|
begin
|
|
dListNode.Add(TagNodeList[a]);
|
|
result:=true;
|
|
end;
|
|
TagNodeList.Free;
|
|
end;
|
|
end
|
|
else
|
|
if hAttrTxt = '' then
|
|
begin
|
|
TagNodeList:=TNodeList.Create;
|
|
if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then
|
|
for a := 0 to TagNodeList.Count - 1 do
|
|
begin
|
|
dListNode.Add(TagNodeList[a]);
|
|
result:=true;
|
|
end;
|
|
TagNodeList.Free;
|
|
end
|
|
else raise Exception.create('Attribute not found: '+ hAttrTxt );
|
|
|
|
finally
|
|
RegEx.free
|
|
end;
|
|
end;
|
|
|
|
//***********FindTagOfIndex*************
|
|
// hNameTag - name Tag (* - any tag, except text tag)
|
|
// hIndex - number of a tag one after another (0 - all tag, 1 - each first ..)
|
|
// hAnyLevel - true - all level after start node; false - only one child level after start node
|
|
// dListNode - return TNodeList of TDomTreeNode
|
|
|
|
function TDomTreeNode.FindTagOfIndex(hNameTag: String; hIndex:integer;
|
|
hAnyLevel: Boolean; dListNode: TNodeList): Boolean;
|
|
|
|
function SubStringOccurences(const subString, sourceString : string; caseSensitive : boolean) : integer;
|
|
var
|
|
pEx: integer;
|
|
sub, source : string;
|
|
begin
|
|
if caseSensitive then
|
|
begin
|
|
sub := subString;
|
|
source := sourceString;
|
|
end
|
|
else
|
|
begin
|
|
sub := LowerCase(subString);
|
|
source := LowerCase(sourceString);
|
|
end;
|
|
|
|
result := 0;
|
|
pEx := PosEx(sub, source, 1);
|
|
while pEx <> 0 do
|
|
begin
|
|
Inc(result);
|
|
pEx := PosEx(sub, source, pEx + Length(sub));
|
|
end;
|
|
end;
|
|
|
|
Function FindChildTagOfIndex(aNode:TDomTreeNode):TNodeList;
|
|
var
|
|
countNode,j: integer;
|
|
enumTags:string;
|
|
begin
|
|
countNode:=0;
|
|
for j := 0 to aNode.Child.Count - 1 do
|
|
begin
|
|
if hNameTag <> '*' then
|
|
begin
|
|
if ((AnsiUpperCase(aNode.Child[j].Tag) = AnsiUpperCase(hNameTag)) and (aNode.Child[j].TypeTag <> '</%s>'))
|
|
or ((AnsiUpperCase(aNode.Child[j].Tag) = '') and (AnsiUpperCase(hNameTag)='TEXT()') and (aNode.Child[j].Text <> ''))
|
|
or ((LeftStr(AnsiUpperCase(aNode.Child[j].Tag),4) = '<!--') and (AnsiUpperCase(hNameTag)='COMMENT()'))
|
|
then
|
|
begin
|
|
Inc(countNode);
|
|
if (countNode = hIndex ) or (hIndex = 0) then dListNode.Add(aNode.Child[j])
|
|
end;
|
|
if (hAnyLevel) and (aNode.Child.Count > 0) then FindChildTagOfIndex(aNode.Child[j]) ;
|
|
end
|
|
else
|
|
begin
|
|
if (aNode.Child[j].TypeTag <> '</%s>') then
|
|
begin
|
|
enumTags:=enumTags + AnsiUpperCase(aNode.Child[j].Tag)+',';
|
|
|
|
if (SubStringOccurences(AnsiUpperCase(aNode.Child[j].Tag)+',',enumTags, false) = hIndex ) or (hIndex = 0) then dListNode.Add(aNode.Child[j])
|
|
end;
|
|
if (hAnyLevel) and (aNode.Child.Count > 0) then FindChildTagOfIndex(aNode.Child[j]) ;
|
|
end;
|
|
end;
|
|
result:=dListNode;
|
|
end;
|
|
|
|
|
|
begin
|
|
result:=false;
|
|
if FindChildTagOfIndex(self).Count > 0
|
|
then result:=true;
|
|
end;
|
|
|
|
function TDomTreeNode.FindXPath(hXPathTxt: String; dListNode: TNodeList;
|
|
dListValue:TStringList): Boolean;
|
|
var
|
|
RegExXPath, RegExXPathElmt: TPerlRegEx;
|
|
i: integer;
|
|
NextAnyLevel:boolean;
|
|
PrmXPath:TPrmRecList;
|
|
PrmXPathSTR: String;
|
|
PrmCount:integer;
|
|
procedure MatchXpath(Context,mTxtElmt:string) ;
|
|
var
|
|
Prm: PPrmRec;
|
|
begin
|
|
if (Context='/') and (trim(mTxtElmt)='') then NextAnyLevel:=true
|
|
else if (Context='/') and (trim(mTxtElmt)='..') then
|
|
begin
|
|
New(prm);
|
|
Prm.TagName:='..';
|
|
Prm.ind:=0;
|
|
Prm.Attr:='';
|
|
Prm.AnyLevel:=false;
|
|
PrmXPath.Add(Prm);
|
|
end
|
|
else
|
|
begin
|
|
RegExXPathElmt.Options := [preCaseLess];
|
|
RegExXPathElmt.Subject:=trim(mTxtElmt);
|
|
RegExXPathElmt.RegEx:='^([\.\*@A-Z][-A-Z0-9\(\)]*)\[?([0-9]*)\]?\[?@?([^\]]*)';
|
|
if RegExXPathElmt.Match then
|
|
begin
|
|
New(prm);
|
|
Prm.TagName:=RegExXPathElmt.Groups[1];
|
|
if not TryStrToInt( RegExXPathElmt.Groups[2], Prm.ind ) then Prm.ind:=0;
|
|
Prm.Attr:=RegExXPathElmt.Groups[3];
|
|
Prm.AnyLevel:=NextAnyLevel;
|
|
if (Context='/') then NextAnyLevel:=False;
|
|
PrmXPath.Add(Prm);
|
|
end
|
|
else
|
|
raise Exception.create('XPath is not correct '+ Context + mTxtElmt );
|
|
end;
|
|
end;
|
|
|
|
Function FindWithPrm(cPrm:integer; CurNode:TDomTreeNode; dListNode: TNodeList) : boolean;
|
|
var
|
|
i: integer;
|
|
cLNode: TNodeList;
|
|
begin
|
|
result:=false;
|
|
if PrmXPath[cPrm].TagName = '..' then
|
|
FindWithPrm(cPrm + 1,CurNode.Parent, dListNode)
|
|
else
|
|
begin
|
|
cLNode:=TNodeList.Create;
|
|
if CurNode.FindNode(PrmXPath[cPrm].TagName,PrmXPath[cPrm].ind,PrmXPath[cPrm].Attr,PrmXPath[cPrm].AnyLevel,cLNode) then
|
|
for I := 0 to cLNode.Count - 1 do
|
|
if cPrm < PrmCount then
|
|
FindWithPrm(cPrm + 1,cLNode[i], dListNode)
|
|
else dListNode.Add(cLNode[i]) ;
|
|
cLNode.free;
|
|
end;
|
|
if dListNode.Count > 0 then result:=true
|
|
end;
|
|
begin
|
|
PrmXPath:=nil;
|
|
RegExXPath:=nil;
|
|
RegExXPathElmt:=nil;
|
|
try
|
|
result:=false;
|
|
NextAnyLevel:=false;
|
|
PrmXPath:=TPrmRecList.Create;
|
|
PrmXPathSTR:='';
|
|
RegExXPath := TPerlRegEx.create;
|
|
RegExXPathElmt := TPerlRegEx.create;
|
|
|
|
RegExXPath.Subject:= hXPathTxt;
|
|
RegExXPath.RegEx:='(/)([\*@]?[^/]*)';
|
|
if RegExXPath.Match then
|
|
begin
|
|
MatchXpath(RegExXPath.Groups[1],RegExXPath.Groups[2]);
|
|
while RegExXPath.MatchAgain do
|
|
MatchXpath(RegExXPath.Groups[1],RegExXPath.Groups[2]);
|
|
for i := 0 to PrmXPath.Count-1 do
|
|
PrmXPathSTR:=PrmXPathSTR + PrmXPath[i].TagName +',' + inttostr(PrmXPath[i].ind) +',' + PrmXPath[i].Attr+',' + BoolToStr(PrmXPath[i].AnyLevel,True)+chr(13)+chr(10);
|
|
|
|
if PrmXPath.Count > 0 then
|
|
begin
|
|
if (PrmXPath[PrmXPath.Count-1].TagName[1]='@')
|
|
then
|
|
begin
|
|
PrmCount:= PrmXPath.Count - 2;
|
|
PrmXPath[PrmXPath.Count-1].TagName:=AnsiReplaceStr(PrmXPath[PrmXPath.Count-1].TagName,'@','');
|
|
if FindWithPrm(0,self,dListNode) then
|
|
begin
|
|
for I := 0 to dListNode.Count-1 do
|
|
if dListNode[i].GetAttrValue(PrmXPath[PrmXPath.Count-1].TagName)<>'' then
|
|
dListValue.Add(dListNode[i].GetAttrValue(PrmXPath[PrmXPath.Count-1].TagName));
|
|
if dListValue.Count > 0 then result:= true
|
|
else result:=false;
|
|
end
|
|
else result:=false;
|
|
end
|
|
else
|
|
begin
|
|
PrmCount:= PrmXPath.Count - 1;
|
|
result:= FindWithPrm(0,self,dListNode);
|
|
if (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='comment()')
|
|
or (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='text()') then
|
|
for I := 0 to dListNode.Count-1 do
|
|
begin
|
|
if (AnsiLowerCase(PrmXPath[PrmXPath.Count-1].TagName)='text()')
|
|
then dListValue.Add(dListNode[i].Text)
|
|
else dListValue.Add(TDomTreeNode(dListNode[i]).Tag) ;
|
|
|
|
end;
|
|
end;
|
|
end
|
|
else raise Exception.create('XPath is not correct or empty.');
|
|
end
|
|
else raise Exception.create('XPath is not correct or empty.');
|
|
finally
|
|
PrmXPath.Free;
|
|
RegExXPath.Free;
|
|
RegExXPathElmt.Free;
|
|
end;
|
|
|
|
end;
|
|
|
|
function TDomTreeNode.GetAttrValue(hAttrName:string): string;
|
|
begin
|
|
result:='';
|
|
if self.Attributes <> nil then
|
|
if self.Attributes.ContainsKey(hAttrName) then
|
|
if not self.Attributes.TryGetValue(hAttrName, result) then
|
|
result:='';
|
|
|
|
end;
|
|
|
|
function TDomTreeNode.GetComment(hIndex: Integer): string;
|
|
var
|
|
countNode,j: integer;
|
|
begin
|
|
result:='';
|
|
countNode:=0;
|
|
for j := 0 to self.Child.Count - 1 do
|
|
if (LeftStr(self.Child[j].Tag,4) = '<!--') and
|
|
(self.Child[j].TypeTag = '%s') and
|
|
(self.Child[j].Text = '')
|
|
then
|
|
begin
|
|
Inc(countNode);
|
|
if (countNode = hIndex ) or (hIndex = 0) then
|
|
begin
|
|
result:= self.Child[j].Tag;
|
|
break;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function TDomTreeNode.GetTagName: string;
|
|
begin
|
|
if self.TypeTag='</%s>' then
|
|
result:= format(AnsiReplaceStr(self.TypeTag,'/',''),[self.Tag + ' ' + self.AttributesTxt] )
|
|
else
|
|
result:= format(self.TypeTag,[self.Tag + ' ' + self.AttributesTxt] );
|
|
end;
|
|
|
|
function TDomTreeNode.GetTextValue(hIndex:Integer): string;
|
|
var
|
|
countNode,j: integer;
|
|
begin
|
|
result:='';
|
|
countNode:=0;
|
|
for j := 0 to self.Child.Count - 1 do
|
|
if (self.Child[j].Tag = '') and
|
|
(self.Child[j].TypeTag = '') and
|
|
(self.Child[j].Text <> '')
|
|
then
|
|
begin
|
|
Inc(countNode);
|
|
if (countNode = hIndex ) or (hIndex = 0) then
|
|
begin
|
|
result:= self.Child[j].Text;
|
|
break;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function TDomTreeNode.GetXPath(hRelative:boolean): string;
|
|
|
|
function GetCountTag(Node: TDomTreeNode): string;
|
|
var
|
|
CountNode, nNode, i: integer;
|
|
begin
|
|
CountNode:=0;
|
|
result:= '';
|
|
if TObject(Node.Parent) is TDomTreeNode then
|
|
begin
|
|
for i:=0 to TDomTreeNode(Node.Parent).Child.Count - 1 do
|
|
begin
|
|
if (Node.Tag = TDomTreeNode(Node.Parent).Child[i].Tag)
|
|
or ((LeftStr(Node.Tag,4)='<!--') and (LeftStr(TDomTreeNode(Node.Parent).Child[i].Tag,4)='<!--'))
|
|
then
|
|
inc(CountNode);
|
|
if Node = TDomTreeNode(Node.Parent).Child[i] then
|
|
nNode:= CountNode;
|
|
end;
|
|
if (CountNode <> nNode) or ((CountNode = nNode) and (CountNode > 1)) then
|
|
result:= format('[%d]',[nNode]);
|
|
end;
|
|
end;
|
|
|
|
function GetParent(Node: TDomTreeNode): string;
|
|
begin
|
|
if TObject(Node.Parent) is TDomTreeNode then
|
|
begin
|
|
if (hRelative) and (TDomTreeNode(Node.Parent).GetAttrValue('id') <>'') then
|
|
result:=format('//*[@id=%s]',[TDomTreeNode(Node.Parent).GetAttrValue('id')])+
|
|
'/' + result
|
|
else
|
|
result:=GetParent(Node.Parent)+
|
|
TDomTreeNode(Node.Parent).Tag + GetCountTag(Node.Parent) + '/' + result
|
|
end
|
|
else result:='.'+result;
|
|
end;
|
|
|
|
|
|
begin
|
|
if (LeftStr(self.Tag,2) <> '<?') and (LeftStr(self.Tag,9) <> '<!DOCTYPE') then
|
|
begin
|
|
if LeftStr(self.Tag,4) = '<!--' then result:='comment()'
|
|
else if self.Tag <> '' then result:=self.Tag
|
|
else result:='text()';
|
|
result:=GetParent(self) + result + GetCountTag(self);
|
|
if result[1]='.' then
|
|
result:='.'+RightStr(result, length(result) - Pos('/', result, 1)+1);
|
|
end
|
|
else result:='';
|
|
|
|
end;
|
|
|
|
function TDomTreeNode.RunParse(HtmlTxt: String): Boolean;
|
|
var
|
|
RegExHTML, RegExTag: TPerlRegEx;
|
|
prev, ErrParseHTML, ind: integer;
|
|
ChildTree: TDomTreeNode;
|
|
HtmlUtf8, RegExException: string;
|
|
tag_txt: TArray<String>;
|
|
|
|
function getAttr(mAttrTxt: string): TDictionary<string, string>;
|
|
var
|
|
sGroup,
|
|
CheckAttr: String;
|
|
procedure MatchAttr;
|
|
var
|
|
i: integer;
|
|
begin
|
|
CheckAttr := StuffString(CheckAttr,RegExTag.MatchedOffset+1, RegExTag.MatchedLength, StringOfChar(' ',RegExTag.MatchedLength));
|
|
for i := 1 to RegExTag.GroupCount do
|
|
if trim(RegExTag.Groups[i]) <> '' then
|
|
begin
|
|
try
|
|
sGroup := trim(RegExTag.Groups[i]);
|
|
if not result.ContainsKey(sGroup) then // 추가 16_0119 00:28:35 sunk
|
|
result.Add(sGroup, trim(RegExTag.Groups[i + 1]));
|
|
except
|
|
on E: Exception do
|
|
Owner.fParseErr.Add('Warning: not add Attributtes ' +
|
|
E.ClassName + ' : ' + E.Message + 'Sourse string: ' + mAttrTxt +
|
|
';' + chr(13)+chr(10)+' attributtes: ' + RegExTag.Groups[i]);
|
|
end;
|
|
break;
|
|
end;
|
|
end;
|
|
|
|
begin
|
|
try
|
|
result := TDictionary<string, string>.create;
|
|
if trim(mAttrTxt) <> '' then
|
|
begin
|
|
RegExTag.Subject := mAttrTxt;
|
|
CheckAttr := mAttrTxt;
|
|
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
|
|
RegExTag.Replacement:='';
|
|
// here RegExp for processing attributes of tags
|
|
// First not Empty - attribute, next - value
|
|
RegExTag.RegEx :='([^\s]*?[^\S]*)=([^\S]*".*?"[^\S]*)|'+
|
|
'([^\s]*?[^\S]*)=([^\S]*'#39'.*?'#39'[^\S]*)|'+
|
|
'([^\s]*?[^\S]*)=([^\S]*[^\s]+[^\S]*)|'+
|
|
'(allowTransparency[^\S]*)()|'+
|
|
'(allowfullscreen[^\S]*)()|'+
|
|
'(novalidate[^\S]*)()|'+
|
|
'(autofocus[^\S]*)()|'+
|
|
'(itemscope[^\S]*)()|'+
|
|
'(disabled[^\S]*)()|'+
|
|
'(readonly[^\S]*)()|'+
|
|
'(selected[^\S]*)()|'+
|
|
'(checked[^\S]*)()|'+
|
|
'(pubdate[^\S]*)()|'+
|
|
'(nowrap[^\S]*)()|'+
|
|
'(hidden[^\S]*)()|'+
|
|
'(async[^\S]*)()';
|
|
if RegExTag.Match then
|
|
begin
|
|
MatchAttr;
|
|
while RegExTag.MatchAgain do
|
|
MatchAttr;
|
|
// ***Start Check Parsing Tag Attributes Error****
|
|
if Length(Trim(CheckAttr)) > 0 then
|
|
Owner.fParseErr.Add('Warning: parsed not all attributes, ' +
|
|
'sourse string: ' + mAttrTxt + chr(13)+chr(10)+
|
|
'not parsed string: ' + Trim(CheckAttr));
|
|
// ***End Check Parsing Tag Attributes Error************
|
|
end
|
|
else
|
|
Owner.fParseErr.Add('Attributtes not found - ' +
|
|
'Sourse string: ' + mAttrTxt);
|
|
end;
|
|
except
|
|
on E: Exception do
|
|
Owner.fParseErr.Add('Attributtes - ' + E.ClassName + ' : ' +
|
|
E.Message + 'Sourse string: ' + mAttrTxt);
|
|
end;
|
|
end;
|
|
|
|
function getTagTxt(mTxt: string): TArray<String>;
|
|
begin
|
|
try
|
|
SetLength(result, 4);
|
|
result[0] := ''; // name tag
|
|
result[1] := ''; // text attributes
|
|
result[2] := ''; // text value following for tag
|
|
result[3] := ''; // type tag
|
|
if LeftStr(trim(mTxt),2) = '</' then result[3] :='</%s>' //close
|
|
else if RightStr(trim(mTxt),2) = '/>' then result[3] :='<%s/>' //selfclose
|
|
else if LeftStr(trim(mTxt),2) = '<!' then result[3] :='%s'
|
|
else if LeftStr(trim(mTxt),2) = '<?' then result[3] :='%s'
|
|
else result[3] :='<%s>'; // open
|
|
RegExTag.Subject := mTxt;
|
|
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
|
|
// here RegExp for processing HTML tags
|
|
// Group 1- tag, 2- attributes, 3- text
|
|
RegExTag.RegEx := '<([/A-Z][:A-Z0-9]*)\b([^>]*)>([^<]*)';
|
|
if RegExTag.Match then
|
|
begin
|
|
// ****************Start Check Parsing HTML Tag Error************
|
|
if mTxt <> '<' + RegExTag.Groups[1] + RegExTag.Groups[2] + '>' + RegExTag.Groups[3] then
|
|
Owner.fParseErr.Add('Check error Tags parsing - ' + 'Sourse string: ' + mTxt);
|
|
// ****************End Check Parsing HTML Tag Error************
|
|
result[0] := trim(RegExTag.Groups[1]);
|
|
if trim(RegExTag.Groups[2])<> '' then
|
|
if RightStr(trim(RegExTag.Groups[2]),1)= '/' then
|
|
result[1] := leftStr(trim(RegExTag.Groups[2]),length(trim(RegExTag.Groups[2]))-1)
|
|
else result[1] := trim(RegExTag.Groups[2]);
|
|
result[2] := trim(RegExTag.Groups[3]);
|
|
end
|
|
else
|
|
result[0] := trim(mTxt);
|
|
except
|
|
on E: Exception do
|
|
Owner.fParseErr.Add('Tags - ' + E.ClassName + ' : ' + E.Message +
|
|
'Sourse string: ' + mTxt);
|
|
end;
|
|
end;
|
|
|
|
function getPairTagTxt(mTxt, mPattern: string): TArray<String>;
|
|
begin
|
|
try
|
|
SetLength(result, 4);
|
|
result[0] := ''; // name tag
|
|
result[1] := ''; // text attributes
|
|
result[2] := ''; // text value following for tag
|
|
result[3] := ''; // close tag
|
|
|
|
RegExTag.Subject := mTxt;
|
|
RegExTag.Options := [preCaseLess, preMultiLine, preSingleLine];
|
|
// here RegExp for processing HTML tags
|
|
// Group 1- tag, 2- attributes, 3- text
|
|
RegExTag.RegEx := mPattern;
|
|
if RegExTag.Match then
|
|
begin
|
|
// ****************Start Check Parsing HTML Tag Error************
|
|
if trim(mTxt) <> '<' + RegExTag.Groups[1] + RegExTag.Groups[2] + '>' + RegExTag.Groups[3] + '<' +RegExTag.Groups[4] +'>' then
|
|
Owner.fParseErr.Add('Check error Exception Tags parsing - ' + 'Sourse string: ' + mTxt);
|
|
// ****************End Check Parsing HTML Tag Error************
|
|
result[0] := trim(RegExTag.Groups[1]);
|
|
result[1] := trim(RegExTag.Groups[2]);
|
|
result[2] := trim(RegExTag.Groups[3]);
|
|
result[3] := trim(RegExTag.Groups[4]);
|
|
end
|
|
else
|
|
result[0] := mTxt;
|
|
except
|
|
on E: Exception do
|
|
Owner.fParseErr.Add('Exception Tags - ' + E.ClassName + ' : ' + E.Message +
|
|
'Sourse string: ' + mTxt);
|
|
end;
|
|
end;
|
|
|
|
Function CheckParent(aChildTree: TDomTreeNode; tTag: string):TDomTreeNode;
|
|
var
|
|
ParentTag: string;
|
|
begin
|
|
result := aChildTree.Parent;
|
|
if tTag = '<%s>' then
|
|
result := aChildTree
|
|
else if tTag = '</%s>' then
|
|
if TObject(TDomTreeNode(aChildTree.Parent).Parent) is TDomTreeNode then
|
|
begin
|
|
ParentTag := TDomTreeNode(aChildTree.Parent).Tag;
|
|
if ParentTag = RightStr(aChildTree.Tag, length(aChildTree.Tag) - 1) then
|
|
result := TDomTreeNode(aChildTree.Parent).Parent
|
|
end;
|
|
end;
|
|
|
|
procedure MatchTag(mTxtMatch:string);
|
|
var
|
|
ExceptTag: string;
|
|
begin
|
|
// tag without close tag
|
|
ExceptTag :=
|
|
',META,LINK,IMG,COL,AREA,BASE,BASEFONT,ISINDEX,BGSOUNDCOMMAND,PARAM,INPUT,EMBED,FRAME,BR,WBR,HR,TRACK,';
|
|
|
|
if (leftstr(mTxtMatch, 4) = '<!--') then
|
|
begin
|
|
tag_txt[0] := trim(mTxtMatch);
|
|
tag_txt[1] := '';
|
|
tag_txt[2] := '';
|
|
tag_txt[3] := '%s';
|
|
ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], '', nil, '%s','')) ;
|
|
end
|
|
else if (AnsiUpperCase(leftstr(mTxtMatch, 7)) = '<TITLE>') // tag with any symbol
|
|
or (AnsiUpperCase(leftstr(mTxtMatch, 10)) = '<PLAINTEXT>')
|
|
or (AnsiUpperCase(leftstr(mTxtMatch, 5)) = '<XMP>')
|
|
or (AnsiUpperCase(leftstr(mTxtMatch, 7)) = '<SCRIPT')
|
|
or (AnsiUpperCase(leftstr(mTxtMatch, 9)) = '<TEXTAREA')
|
|
//or (AnsiUpperCase(leftstr(mTxtMatch, 4)) = '<PRE')
|
|
then
|
|
begin
|
|
tag_txt := getPairTagTxt(mTxtMatch,'<([A-Z][A-Z0-9]*)\b([^>]*?)>(.*)<(/\1)>');
|
|
ind:=ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], tag_txt[1], getAttr(tag_txt[1]), '<%s>','')) ;
|
|
if tag_txt[2] <> '' then ChildTree.Child[ind].Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree.Child[ind], '', '', nil, '', tag_txt[2]));
|
|
ChildTree.Child[ind].Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree.Child[ind], tag_txt[3], '', nil, '</%s>','')) ;
|
|
end
|
|
else
|
|
begin
|
|
tag_txt := getTagTxt(mTxtMatch);
|
|
ind := ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, tag_txt[0], tag_txt[1], getAttr(tag_txt[1]), tag_txt[3],''));
|
|
if (Pos(',' + AnsiUpperCase(trim(tag_txt[0])) + ',', ExceptTag) = 0)
|
|
and (LeftStr(tag_txt[0],2) <> '<?')
|
|
and (LeftStr(tag_txt[0],2) <> '<!') then
|
|
ChildTree := CheckParent(ChildTree.Child[ind],tag_txt[3]);
|
|
if tag_txt[2] <> '' then
|
|
ChildTree.Child.Add(TDomTreeNode.create(ChildTree.Owner,ChildTree, '', '', nil, '',tag_txt[2]));
|
|
|
|
end;
|
|
end;
|
|
// *************************** START PARSE HTML*************************
|
|
begin
|
|
RegExHTML:=nil;
|
|
RegExTag:=nil;
|
|
try
|
|
HtmlUtf8 := HtmlTxt;
|
|
RegExHTML := TPerlRegEx.create;
|
|
RegExTag := TPerlRegEx.create;
|
|
ErrParseHTML:=0;
|
|
RegExHTML.Options := [preCaseLess, preMultiLine, preSingleLine];
|
|
ChildTree := self;
|
|
with RegExHTML do
|
|
begin
|
|
|
|
|
|
// *********RegExp for parsing HTML**************
|
|
// (<title>.*</title>[^<]*) - title
|
|
// (<\!--.+?-->[^<]*) - comment
|
|
// (<script.*?</script>[^<]*) - script
|
|
// (<[^>]+>[^<]*) - all remaining tags
|
|
// [^<]* - text
|
|
RegExException :='(<PLAINTEXT>.*?</PLAINTEXT>[^<]*)|'+
|
|
'(<title>.*?</title>[^<]*)|'+
|
|
'(<xmp>.*?</xmp>[^<]*)|'+
|
|
'(<script.*?</script>[^<]*)|'+
|
|
'(<textarea.*?</textarea>[^<]*)|'+
|
|
// '(<pre.*?</pre>[^<]*)|'+
|
|
'(<!--.+?-->[^<]*)|';
|
|
RegEx := RegExException + '(<[^>]+>[^<]*)'; // all teg and text
|
|
Subject := HtmlUtf8;
|
|
if Match then
|
|
begin
|
|
MatchTag(RegExHTML.MatchedText);
|
|
prev := MatchedOffset + MatchedLength;
|
|
while MatchAgain do
|
|
begin
|
|
MatchTag(RegExHTML.MatchedText);
|
|
// *****Start Check Parsing HTML Error************
|
|
if MatchedOffset - prev > 0 then
|
|
begin
|
|
Owner.fParseErr.Add(IntToStr(ErrParseHTML) + '- Check error found after HTML parsing');
|
|
inc(ErrParseHTML)
|
|
end;
|
|
prev := MatchedOffset + MatchedLength;
|
|
// *****End Check Parsing HTML Error************
|
|
end;
|
|
// ***********End RegExp match cycle************
|
|
end
|
|
else
|
|
raise Exception.create('Input text not contain HTML tags');
|
|
// *************End RegExp match ************
|
|
end;
|
|
|
|
Finally
|
|
RegExHTML.Free;
|
|
RegExTag.Free;
|
|
if Owner.FCount>0 then
|
|
result := True
|
|
else result := False ;
|
|
end;
|
|
|
|
end;
|
|
|
|
end.
|