{*******************************************************} { } { Tocsg.Html } { } { Copyright (C) 2020 kku } { } {*******************************************************} unit Tocsg.Html; interface uses Tocsg.Obj, System.SysUtils, System.Classes; type TTgHtmlParser = class(TTgObject) private sHtml_: String; procedure SetHtmlStr(const sHtml: String); function GetTagData(sTagPath: String): String; function GetTagText(sTagPath: String): String; public Constructor Create; property Html: String read sHtml_ write SetHtmlStr; property Text[sTagpath: String]: String read GetTagText; default; end; function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer; function ExtractElement(sTag, sHtml: String): String; function ExtractElementOnce(sTag, sHtml: String): String; function ClearElement(sTag, sHtml: String): String; overload; function ClearElement(sBTag, sETag, sHtml: String): String; overload; function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer; // 셋중에 하나로 정리 하자.... 19_1205 kku function HtmlDecode(const AStr: String): String; function StripHTMLTags(const strHTML: string): string; function ConvHtmlEntities(sText: String): String; implementation uses EM.DomParser, Tocsg.Strings, Tocsg.Safe; function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer; var nPosB, nPosE, nPosE_Ck: Integer; nBEltC, nEEltC: Integer; sBTag, sETag, sCheckHtml: String; begin EltList.Clear; sBTag := '<' + sTag; sETag := ''; Result := 0; nBEltC := Length(sBTag); while Length(sHtml) > 0 do begin nEEltC := Length(sETag); nPosB := Pos(sBTag, sHtml); if nPosB = 0 then break; Delete(sHtml, 1, nPosB - 1); if (sHtml[nBEltC+1] <> ' ') and (sHtml[nBEltC+1] <> '>') then begin Delete(sHtml, 1, nBEltC); continue; end; // nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1])); sCheckHtml := Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1); nPosB := Pos(sBTag, sCheckHtml); nPosE := Pos(sETag, sHtml); // nPosE_Ck := Pos('<', sCheckHtml, nPosB + 1); // if nPosE > nPosE_Ck then // begin // nPosE := nPosE_Ck; // nEEltc := 0; // end else if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then begin // /> 이렇게 끝날수도 있다. nPosE_Ck := Pos('/>', sHtml); if nPosE > nPosE_Ck then begin nPosE := nPosE_Ck; nEEltC := 2; end; end; if nPosE <> 0 then begin EltList.Add(Copy(sHtml, 1, nPosE + nEEltC - 1)); Delete(sHtml, 1, nPosE + nEEltC - 1); end else break; end; Result := EltList.Count; end; function ExtractElement(sTag, sHtml: String): String; var nPosB, nPosE: Integer; nBEltC, nEEltC: Integer; sBTag, sETag: String; begin Result := ''; sBTag := '<' + sTag; sETag := ''; nBEltC := Length(sBTag); while Length(sHtml) > 0 do begin nEEltC := Length(sETag); nPosB := Pos(sBTag, sHtml); if nPosB = 0 then break; Delete(sHtml, 1, nPosB - 1); // nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1])); nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1)); nPosE := Pos(sETag, sHtml); if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then begin // /> 이렇게 끝날수도 있다. nPosE := Pos('/>', sHtml); nEEltC := 2; end; if nPosE <> 0 then begin Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1); Delete(sHtml, 1, nPosE + nEEltC - 1); end else break; end; end; function ClearElement(sTag, sHtml: String): String; var nPosB, nPosE: Integer; nBEltC, nEEltC: Integer; sBTag, sETag: String; begin Result := ''; sBTag := '<' + sTag; sETag := ''; nBEltC := Length(sBTag); while Length(sHtml) > 0 do begin nEEltC := Length(sETag); nPosB := Pos(sBTag, sHtml); if nPosB = 0 then break; Result := Result + Copy(sHtml, 1, nPosB - 1); Delete(sHtml, 1, nPosB + nBEltC - 1); // nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1])); nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1)); nPosE := Pos(sETag, sHtml); if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then begin // /> 이렇게 끝날수도 있다. nPosE := Pos('/>', sHtml); nEEltC := 2; end; if nPosE = 0 then break else Delete(sHtml, 1, nPosE + nEEltC - 1); end; Result := Result + sHtml; end; function ClearElement(sBTag, sETag, sHtml: String): String; var nPosB, nPosE: Integer; nBEltC, nEEltC: Integer; begin Result := ''; nBEltC := Length(sBTag); while Length(sHtml) > 0 do begin nEEltC := Length(sETag); nPosB := Pos(sBTag, sHtml); if nPosB = 0 then break; Result := Result + Copy(sHtml, 1, nPosB - 1); Delete(sHtml, 1, nPosB + nBEltC - 1); // nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1])); nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1)); nPosE := Pos(sETag, sHtml); if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then begin // /> 이렇게 끝날수도 있다. nPosE := Pos('/>', sHtml); nEEltC := 2; end; if nPosE = 0 then break else Delete(sHtml, 1, nPosE + nEEltC - 1); end; Result := Result + sHtml; end; function ExtractElementOnce(sTag, sHtml: String): String; var nPosB, nPosE: Integer; nBEltC, nEEltC: Integer; sBTag, sETag: String; begin Result := ''; sBTag := '<' + sTag; sETag := ''; nBEltC := Length(sBTag); while Length(sHtml) > 0 do begin nEEltC := Length(sETag); nPosB := Pos(sBTag, sHtml); if nPosB = 0 then break; Delete(sHtml, 1, nPosB - 1); // nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1])); nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1)); nPosE := Pos(sETag, sHtml); if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then begin // /> 이렇게 끝날수도 있다. nPosE := Pos('/>', sHtml); nEEltC := 2; end; if nPosE <> 0 then begin Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1); exit; // Delete(sHtml, 1, nPosE + nEEltC - 1); end else break; end; end; function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer; var sBTag: String; i, nPos, nLen: Integer; begin Result := 0; AttrList.Clear; nPos := Pos('<', sHtml); if nPos > 0 then begin Delete(sHtml, 1, nPos); nPos := Pos('>', sHtml); if nPos > 0 then begin SetLength(sHtml, nPos - 1); // Delete(sHtml, nPos, Length(sHtml) - nPos); nLen := Length(sHtml); if sHtml[nLen] = '/' then SetLength(sHtml, nLen - 1); SplitString(sHtml, ' ', AttrList); for i := AttrList.Count - 1 downto 0 do if Pos('=', AttrList[i]) = 0 then AttrList.Delete(i); AttrList.Text := StringReplace(AttrList.Text, '"', '', [rfReplaceAll]); AttrList.Text := StringReplace(AttrList.Text, '''', '', [rfReplaceAll]); Result := AttrList.Count; end; end; end; { TTgHtmlParser } Constructor TTgHtmlParser.Create; begin Inherited Create; sHtml_ := ''; end; procedure TTgHtmlParser.SetHtmlStr(const sHtml: String); begin if sHtml_ <> sHtml then sHtml_ := sHtml; end; function TTgHtmlParser.GetTagData(sTagPath: String): String; var TagPaths: TStringList; i, b, e: Integer; sHtml: String; begin Result := ''; sHtml := sHtml_; Guard(TagPaths, TStringList.Create); SplitString(sTagpath, '/', TagPaths); for i := 0 to TagPaths.Count - 1 do sHtml := ExtractElementOnce(TagPaths[i], sHtml); Result := sHtml; end; function TTgHtmlParser.GetTagText(sTagPath: String): String; var b, e: Integer; sAttr: String; AttrList: TStringList; i: Integer; begin Result := ''; try b := LastDelimiter(':', sTagPath); if b > 0 then begin e := Length(sTagPath); sAttr := Copy(sTagPath, b + 1, e - b); Delete(sTagPath, b, e - b + 1); end; Result := GetTagData(sTagpath); if Result <> '' then begin b := Pos('>', Result); e := LastDelimiter('<', Result); if b < e then begin if sAttr <> '' then begin Guard(AttrList, TStringList.Create); if ExtractAttrToStrings(Result, AttrList) > 0 then begin Result := ''; for i := 0 to AttrList.Count - 1 do if Pos(sAttr + '=', AttrList[i]) = 1 then begin Result := Copy(AttrList[i], Length(sAttr) + 2, Length(AttrList[i]) - Length(sAttr) + 1); exit; end; end else Result := ''; end else begin Result := Copy(Result, b + 1, e - b - 1); Result := ClearElement('<', '>', Result) end; end else Result := ''; end; finally if Result <> '' then begin Result := StringReplace(Result, #13, ' ', [rfReplaceAll]); Result := StringReplace(Result, #10, '', [rfReplaceAll]); Result := Trim(Result); end; end; end; // By - https://stackoverflow.com/questions/1657105/delphi-html-decode function HtmlDecode(const AStr: String): String; var Sp, Rp, Cp, Tp: PChar; S: String; I, Code: Integer; begin SetLength(Result, Length(AStr)); Sp := PChar(AStr); Rp := PChar(Result); Cp := Sp; try while Sp^ <> #0 do begin case Sp^ of '&': begin Cp := Sp; Inc(Sp); case Sp^ of 'a': if AnsiStrPos(Sp, 'amp;') = Sp then { do not localize } begin Inc(Sp, 3); Rp^ := '&'; end; 'l', 'g': if (AnsiStrPos(Sp, 'lt;') = Sp) or (AnsiStrPos(Sp, 'gt;') = Sp) then { do not localize } begin Cp := Sp; Inc(Sp, 2); while (Sp^ <> ';') and (Sp^ <> #0) do Inc(Sp); if Cp^ = 'l' then Rp^ := '<' else Rp^ := '>'; end; 'n': if AnsiStrPos(Sp, 'nbsp;') = Sp then { do not localize } begin Inc(Sp, 4); Rp^ := ' '; end; 'q': if AnsiStrPos(Sp, 'quot;') = Sp then { do not localize } begin Inc(Sp,4); Rp^ := '"'; end; '#': begin Tp := Sp; Inc(Tp); while (Sp^ <> ';') and (Sp^ <> #0) do Inc(Sp); SetString(S, Tp, Sp - Tp); Val(S, I, Code); Rp^ := Chr((I)); end; else Exit; end; end else Rp^ := Sp^; end; Inc(Rp); Inc(Sp); end; except end; SetLength(Result, Rp - PChar(Result)); end; // By - https://stackoverflow.com/questions/3001443/get-the-rendered-text-from-html-delphi function StripHTMLTags(const strHTML: string): string; var P: PChar; InTag: Boolean; i, intResultLength: Integer; begin P := PChar(strHTML); Result := ''; InTag := False; repeat case P^ of '<': InTag := True; '>': InTag := False; #13, #10: ; {do nothing} else if not InTag then begin if (P^ in [#9, #32]) and ((P+1)^ in [#10, #13, #32, #9, '<']) then else Result := Result + P^; end; end; Inc(P); until (P^ = #0); {convert system characters} Result := StringReplace(Result, '"', '"', [rfReplaceAll]); Result := StringReplace(Result, ''', '''', [rfReplaceAll]); Result := StringReplace(Result, '>', '>', [rfReplaceAll]); Result := StringReplace(Result, '<', '<', [rfReplaceAll]); Result := StringReplace(Result, '&', '&', [rfReplaceAll]); Result := StringReplace(Result, ' ', ' ', [rfReplaceAll]); {here you may add another symbols from RFC if you need} end; function ConvHtmlEntities(sText: String): String; begin Result := StringReplace(sText, #9, '', [rfReplaceAll]); Result := StringReplace(Result, '<', '<', [rfReplaceAll]); Result := StringReplace(Result, '>', '>', [rfReplaceAll]); Result := StringReplace(Result, '&', '&', [rfReplaceAll]); Result := StringReplace(Result, '"', '"', [rfReplaceAll]); Result := StringReplace(Result, ' ', ' ', [rfReplaceAll]); Result := StringReplace(Result, ''', '''', [rfReplaceAll]); Result := StringReplace(Result, '·', '·', [rfReplaceAll]); end; end.