{==============================================================================| | Project : Delphi HTML/XHTML parser module | 1.1.2 | |==============================================================================| | Content: | |==============================================================================| | The contents of this file are subject to the Mozilla Public License Ver. 1.0 | | (the "License"); you may not use this file except in compliance with the | | License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ | | | | Software distributed under the License is distributed on an "AS IS" basis, | | WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for | | the specific language governing rights and limitations under the License. | |==============================================================================| | Initial Developers of the Original Code are: | | Sandbil (Russia) sandbil@ya.ru | | All Rights Reserved. | | Last Modified: | | 25.10.2014, Sandbil | |==============================================================================| | History: see README | |==============================================================================|} unit EM.DomParser; interface uses System.Classes, System.RegularExpressionsCore, System.Generics.Collections, System.Contnrs, System.StrUtils, System.SysUtils; type TNodeList = class; TChildList=class; TDomTreeNode = class; TDomTree = class private FCount: Integer; fParseErr: TStringList; fRootNode: TDomTreeNode; public constructor Create; destructor destroy; override; property Count: Integer read fCount; property RootNode: TDomTreeNode read fRootNode; property ParseErr: TStringList read fParseErr; end; TDomTreeNode = class(TObject) private fTag: string; fAttributesTxt: string; fAttributes: TDictionary; fText: string; fTypeTag: string; fChild: TChildList; fParent: Pointer; fOwner: TDomTree; public property Tag: string read fTag; property AttributesTxt: string read fAttributesTxt; property Attributes: TDictionary read fAttributes; property Text: string read fText; property TypeTag: string read fTypeTag; property Child: TChildList read fChild; property Parent: Pointer read fParent; property Owner: TDomTree read fOwner; constructor create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string; hAttr: TDictionary; hTypeTag, hText: string); destructor destroy; override; function FindNode(hNameTag: string; hIndex:integer; hAttrTxt: String; hAnyLevel: Boolean; dListNode: TNodeList): Boolean; function FindTagOfIndex(hNameTag: String; hIndex:integer; hAnyLevel: Boolean; dListNode: TNodeList): Boolean; function FindXPath(hXPathTxt: String; dListNode: TNodeList; dListValue:TStringList): Boolean; function GetAttrValue(hAttrName:string): string; function GetComment(hIndex: Integer): string; function GetTagName: string; function GetTextValue(hIndex:Integer): string; function GetXPath(hRelative:boolean): string; function RunParse(HtmlTxt: String): Boolean; end; TChildList = class(TList) private function Get(Index: Integer): TDomTreeNode; public destructor Destroy; override; property Items[Index: Integer]: TDomTreeNode read Get; default; end; TNodeList = class(TList) private function Get(Index: Integer): TDomTreeNode; public property Items[Index: Integer]: TDomTreeNode read Get; default; end; PPrmRec=^TPrmRec; TPrmRec = record TagName: string; ind: Integer; Attr: string; AnyLevel: Boolean; end; TPrmRecList = class(TList) private function Get(Index: Integer): PPrmRec; public destructor Destroy; override; property Items[Index: Integer]: PPrmRec read Get; default; end; implementation { TDomTree } { *********************************** TDomTree *********************************** } constructor TDomTree.Create; begin fParseErr:= TStringList.Create; fRootnode:= TDomTreeNode.Create(self,self,'Root','',nil,'',''); FCount:=0; end; destructor TDomTree.destroy; begin FreeAndNil(fParseErr); FreeAndNil(fRootNode); inherited; end; { TChildList } { ********************************** TChildList ********************************** } destructor TChildList.Destroy; var i: Integer; begin for i := 0 to Count - 1 do self[i].Free; inherited; end; function TChildList.Get(Index: Integer): TDomTreeNode; begin Result := TDomTreeNode(inherited Get(Index)); end; { TNodeList } function TNodeList.Get(Index: Integer): TDomTreeNode; begin Result := TDomTreeNode(inherited Get(Index)); end; { TPrmRecList } { ********************************* TPrmRecList ********************************** } destructor TPrmRecList.Destroy; var i: Integer; begin for i := 0 to Count - 1 do FreeMem(Items[i]); inherited; end; function TPrmRecList.Get(Index: Integer): PPrmRec; begin Result := PPrmRec(inherited Get(Index)); end; { TDomTreeNode } { ********************************* TDomTreeNode ********************************* } constructor TDomTreeNode.create(hOwner: TDomTree; hParent: Pointer; hTag, hAttrTxt: string; hAttr: TDictionary; hTypeTag, hText: string); begin fChild := TChildList.create; fParent := hParent; fTag := hTag; fAttributesTxt := hAttrTxt; fAttributes := hAttr; fTypeTag:= hTypeTag; fText := hText; fOwner:=hOwner; inc(hOwner.FCount); end; destructor TDomTreeNode.destroy; begin FreeAndNil(fAttributes); FreeAndNil(fChild); inherited; end; //***********FindAttr************* // hNameTag - name Tag // hIndex - number of a tag one after another (0 - all tag, 1 - each first ..) // hAttrTxt - attribute. ex. alt=1 // hAnyLevel - true - all levels after start node; false - only one child level after start node // dListNode - return TNodeList of TDomTreeNode function TDomTreeNode.FindNode(hNameTag: string; hIndex:integer; hAttrTxt: String; hAnyLevel: Boolean; dListNode: TNodeList): Boolean; var RegEx: TPerlRegEx; i,a: integer; TagNodeList:TNodeList; tValue: string; Function FindAttrChildNode(aNode:TDomTreeNode;AttrName,AttrValue: String):TNodeList; var aValue: String; j: integer; begin for j := 0 to aNode.Child.Count - 1 do begin if aNode.Child[j].Attributes <> nil then if aNode.Child[j].Attributes.ContainsKey(AttrName) then if aNode.Child[j].Attributes.TryGetValue(AttrName, aValue) then if AttrValue = aValue then dListNode.Add(aNode.Child[j]); if hAnyLevel then FindAttrChildNode(aNode.Child[j], AttrName, AttrValue); end; result:=dListNode; end; begin RegEx:=nil; try result:=false; RegEx := TPerlRegEx.create; RegEx.Subject := hAttrTxt; RegEx.RegEx :='([^\s]*?[^\S]*)=([^\S]*".*?"[^\S]*)|'+ '([^\s]*?[^\S]*)=([^\S]*#39.*?#39[^\S]*)|'+ '([^\s]*?[^\S]*)=([^\S]*[^\s]+[^\S]*)|'+ '(autofocus[^\S]*)()|'+ '(disabled[^\S]*)()|'+ '(selected[^\S]*)()'; if (not (hAttrTxt = '')) and (RegEx.Match) then begin for i := 1 to RegEx.GroupCount do if trim(RegEx.Groups[i]) <> '' then break; if hNameTag = '' then begin if FindAttrChildNode(self,RegEx.Groups[i],RegEx.Groups[i+1]).Count>0 then result:=true; end else begin TagNodeList:=TNodeList.Create; if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then for a := 0 to TagNodeList.Count - 1 do if TagNodeList[a].Attributes <> nil then if TagNodeList[a].Attributes.ContainsKey(RegEx.Groups[i]) then if TagNodeList[a].Attributes.TryGetValue(RegEx.Groups[i], tValue) then //There was a strong compareson of values of attribute // if RegEx.Groups = tValue) if Pos(RegEx.Groups[i+1], tValue)>0 then begin dListNode.Add(TagNodeList[a]); result:=true; end; TagNodeList.Free; end; end else if hAttrTxt = '' then begin TagNodeList:=TNodeList.Create; if FindTagOfIndex(hNameTag,hIndex,hAnyLevel,TagNodeList) then for a := 0 to TagNodeList.Count - 1 do begin dListNode.Add(TagNodeList[a]); result:=true; end; TagNodeList.Free; end else raise Exception.create('Attribute not found: '+ hAttrTxt ); finally RegEx.free end; end; //***********FindTagOfIndex************* // hNameTag - name Tag (* - any tag, except text tag) // hIndex - number of a tag one after another (0 - all tag, 1 - each first ..) // hAnyLevel - true - all level after start node; false - only one child level after start node // dListNode - return TNodeList of TDomTreeNode function TDomTreeNode.FindTagOfIndex(hNameTag: String; hIndex:integer; hAnyLevel: Boolean; dListNode: TNodeList): Boolean; function SubStringOccurences(const subString, sourceString : string; caseSensitive : boolean) : integer; var pEx: integer; sub, source : string; begin if caseSensitive then begin sub := subString; source := sourceString; end else begin sub := LowerCase(subString); source := LowerCase(sourceString); end; result := 0; pEx := PosEx(sub, source, 1); while pEx <> 0 do begin Inc(result); pEx := PosEx(sub, source, pEx + Length(sub)); end; end; Function FindChildTagOfIndex(aNode:TDomTreeNode):TNodeList; var countNode,j: integer; enumTags:string; begin countNode:=0; for j := 0 to aNode.Child.Count - 1 do begin if hNameTag <> '*' then begin if ((AnsiUpperCase(aNode.Child[j].Tag) = AnsiUpperCase(hNameTag)) and (aNode.Child[j].TypeTag <> '')) or ((AnsiUpperCase(aNode.Child[j].Tag) = '') and (AnsiUpperCase(hNameTag)='TEXT()') and (aNode.Child[j].Text <> '')) or ((LeftStr(AnsiUpperCase(aNode.Child[j].Tag),4) = '[^<]*) - comment // ([^<]*) - script // (<[^>]+>[^<]*) - all remaining tags // [^<]* - text RegExException :='(.*?</PLAINTEXT>[^<]*)|'+ '(<title>.*?</title>[^<]*)|'+ '(<xmp>.*?</xmp>[^<]*)|'+ '(<script.*?</script>[^<]*)|'+ '(<textarea.*?</textarea>[^<]*)|'+ // '(<pre.*?</pre>[^<]*)|'+ '(<!--.+?-->[^<]*)|'; RegEx := RegExException + '(<[^>]+>[^<]*)'; // all teg and text Subject := HtmlUtf8; if Match then begin MatchTag(RegExHTML.MatchedText); prev := MatchedOffset + MatchedLength; while MatchAgain do begin MatchTag(RegExHTML.MatchedText); // *****Start Check Parsing HTML Error************ if MatchedOffset - prev > 0 then begin Owner.fParseErr.Add(IntToStr(ErrParseHTML) + '- Check error found after HTML parsing'); inc(ErrParseHTML) end; prev := MatchedOffset + MatchedLength; // *****End Check Parsing HTML Error************ end; // ***********End RegExp match cycle************ end else raise Exception.create('Input text not contain HTML tags'); // *************End RegExp match ************ end; Finally RegExHTML.Free; RegExTag.Free; if Owner.FCount>0 then result := True else result := False ; end; end; end.