504 lines
13 KiB
Plaintext
504 lines
13 KiB
Plaintext
{*******************************************************}
|
|
{ }
|
|
{ Tocsg.Html }
|
|
{ }
|
|
{ Copyright (C) 2020 kku }
|
|
{ }
|
|
{*******************************************************}
|
|
|
|
unit Tocsg.Html;
|
|
|
|
interface
|
|
|
|
uses
|
|
Tocsg.Obj, System.SysUtils, System.Classes;
|
|
|
|
type
|
|
TTgHtmlParser = class(TTgObject)
|
|
private
|
|
sHtml_: String;
|
|
procedure SetHtmlStr(const sHtml: String);
|
|
function GetTagData(sTagPath: String): String;
|
|
function GetTagText(sTagPath: String): String;
|
|
public
|
|
Constructor Create;
|
|
|
|
property Html: String read sHtml_ write SetHtmlStr;
|
|
property Text[sTagpath: String]: String read GetTagText; default;
|
|
end;
|
|
|
|
function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer;
|
|
function ExtractElement(sTag, sHtml: String): String;
|
|
function ExtractElementOnce(sTag, sHtml: String): String;
|
|
function ClearElement(sTag, sHtml: String): String; overload;
|
|
function ClearElement(sBTag, sETag, sHtml: String): String; overload;
|
|
|
|
function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer;
|
|
|
|
// 셋중에 하나로 정리 하자.... 19_1205 kku
|
|
function HtmlDecode(const AStr: String): String;
|
|
function StripHTMLTags(const strHTML: string): string;
|
|
function ConvHtmlEntities(sText: String): String;
|
|
|
|
implementation
|
|
|
|
uses
|
|
EM.DomParser, Tocsg.Strings, Tocsg.Safe;
|
|
|
|
function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer;
|
|
var
|
|
nPosB, nPosE, nPosE_Ck: Integer;
|
|
nBEltC, nEEltC: Integer;
|
|
sBTag, sETag, sCheckHtml: String;
|
|
begin
|
|
EltList.Clear;
|
|
|
|
sBTag := '<' + sTag;
|
|
sETag := '</' + sTag + '>';
|
|
|
|
Result := 0;
|
|
nBEltC := Length(sBTag);
|
|
while Length(sHtml) > 0 do
|
|
begin
|
|
nEEltC := Length(sETag);
|
|
nPosB := Pos(sBTag, sHtml);
|
|
if nPosB = 0 then
|
|
break;
|
|
|
|
Delete(sHtml, 1, nPosB - 1);
|
|
if (sHtml[nBEltC+1] <> ' ') and (sHtml[nBEltC+1] <> '>') then
|
|
begin
|
|
Delete(sHtml, 1, nBEltC);
|
|
continue;
|
|
end;
|
|
|
|
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
|
|
sCheckHtml := Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1);
|
|
nPosB := Pos(sBTag, sCheckHtml);
|
|
nPosE := Pos(sETag, sHtml);
|
|
// nPosE_Ck := Pos('<', sCheckHtml, nPosB + 1);
|
|
|
|
// if nPosE > nPosE_Ck then
|
|
// begin
|
|
// nPosE := nPosE_Ck;
|
|
// nEEltc := 0;
|
|
// end else
|
|
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
|
|
begin
|
|
// /> 이렇게 끝날수도 있다.
|
|
nPosE_Ck := Pos('/>', sHtml);
|
|
if nPosE > nPosE_Ck then
|
|
begin
|
|
nPosE := nPosE_Ck;
|
|
nEEltC := 2;
|
|
end;
|
|
end;
|
|
|
|
if nPosE <> 0 then
|
|
begin
|
|
EltList.Add(Copy(sHtml, 1, nPosE + nEEltC - 1));
|
|
Delete(sHtml, 1, nPosE + nEEltC - 1);
|
|
end else break;
|
|
end;
|
|
|
|
Result := EltList.Count;
|
|
end;
|
|
|
|
function ExtractElement(sTag, sHtml: String): String;
|
|
var
|
|
nPosB, nPosE: Integer;
|
|
nBEltC, nEEltC: Integer;
|
|
sBTag, sETag: String;
|
|
begin
|
|
Result := '';
|
|
|
|
sBTag := '<' + sTag;
|
|
sETag := '</' + sTag + '>';
|
|
|
|
nBEltC := Length(sBTag);
|
|
while Length(sHtml) > 0 do
|
|
begin
|
|
nEEltC := Length(sETag);
|
|
nPosB := Pos(sBTag, sHtml);
|
|
if nPosB = 0 then
|
|
break;
|
|
|
|
Delete(sHtml, 1, nPosB - 1);
|
|
|
|
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
|
|
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
|
|
nPosE := Pos(sETag, sHtml);
|
|
|
|
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
|
|
begin
|
|
// /> 이렇게 끝날수도 있다.
|
|
nPosE := Pos('/>', sHtml);
|
|
nEEltC := 2;
|
|
end;
|
|
|
|
if nPosE <> 0 then
|
|
begin
|
|
Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1);
|
|
Delete(sHtml, 1, nPosE + nEEltC - 1);
|
|
end else break;
|
|
end;
|
|
end;
|
|
|
|
function ClearElement(sTag, sHtml: String): String;
|
|
var
|
|
nPosB, nPosE: Integer;
|
|
nBEltC, nEEltC: Integer;
|
|
sBTag, sETag: String;
|
|
begin
|
|
Result := '';
|
|
|
|
sBTag := '<' + sTag;
|
|
sETag := '</' + sTag + '>';
|
|
|
|
nBEltC := Length(sBTag);
|
|
while Length(sHtml) > 0 do
|
|
begin
|
|
nEEltC := Length(sETag);
|
|
nPosB := Pos(sBTag, sHtml);
|
|
if nPosB = 0 then
|
|
break;
|
|
|
|
Result := Result + Copy(sHtml, 1, nPosB - 1);
|
|
Delete(sHtml, 1, nPosB + nBEltC - 1);
|
|
|
|
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
|
|
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
|
|
nPosE := Pos(sETag, sHtml);
|
|
|
|
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
|
|
begin
|
|
// /> 이렇게 끝날수도 있다.
|
|
nPosE := Pos('/>', sHtml);
|
|
nEEltC := 2;
|
|
end;
|
|
|
|
if nPosE = 0 then
|
|
break
|
|
else
|
|
Delete(sHtml, 1, nPosE + nEEltC - 1);
|
|
end;
|
|
Result := Result + sHtml;
|
|
end;
|
|
|
|
function ClearElement(sBTag, sETag, sHtml: String): String;
|
|
var
|
|
nPosB, nPosE: Integer;
|
|
nBEltC, nEEltC: Integer;
|
|
begin
|
|
Result := '';
|
|
|
|
nBEltC := Length(sBTag);
|
|
while Length(sHtml) > 0 do
|
|
begin
|
|
nEEltC := Length(sETag);
|
|
nPosB := Pos(sBTag, sHtml);
|
|
if nPosB = 0 then
|
|
break;
|
|
|
|
Result := Result + Copy(sHtml, 1, nPosB - 1);
|
|
Delete(sHtml, 1, nPosB + nBEltC - 1);
|
|
|
|
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
|
|
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
|
|
nPosE := Pos(sETag, sHtml);
|
|
|
|
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
|
|
begin
|
|
// /> 이렇게 끝날수도 있다.
|
|
nPosE := Pos('/>', sHtml);
|
|
nEEltC := 2;
|
|
end;
|
|
|
|
if nPosE = 0 then
|
|
break
|
|
else
|
|
Delete(sHtml, 1, nPosE + nEEltC - 1);
|
|
end;
|
|
Result := Result + sHtml;
|
|
end;
|
|
|
|
function ExtractElementOnce(sTag, sHtml: String): String;
|
|
var
|
|
nPosB, nPosE: Integer;
|
|
nBEltC, nEEltC: Integer;
|
|
sBTag, sETag: String;
|
|
begin
|
|
Result := '';
|
|
|
|
sBTag := '<' + sTag;
|
|
sETag := '</' + sTag + '>';
|
|
|
|
nBEltC := Length(sBTag);
|
|
while Length(sHtml) > 0 do
|
|
begin
|
|
nEEltC := Length(sETag);
|
|
nPosB := Pos(sBTag, sHtml);
|
|
if nPosB = 0 then
|
|
break;
|
|
|
|
Delete(sHtml, 1, nPosB - 1);
|
|
|
|
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
|
|
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
|
|
nPosE := Pos(sETag, sHtml);
|
|
|
|
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
|
|
begin
|
|
// /> 이렇게 끝날수도 있다.
|
|
nPosE := Pos('/>', sHtml);
|
|
nEEltC := 2;
|
|
end;
|
|
|
|
if nPosE <> 0 then
|
|
begin
|
|
Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1);
|
|
exit;
|
|
// Delete(sHtml, 1, nPosE + nEEltC - 1);
|
|
end else break;
|
|
end;
|
|
end;
|
|
|
|
function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer;
|
|
var
|
|
sBTag: String;
|
|
i, nPos, nLen: Integer;
|
|
begin
|
|
Result := 0;
|
|
AttrList.Clear;
|
|
|
|
nPos := Pos('<', sHtml);
|
|
if nPos > 0 then
|
|
begin
|
|
Delete(sHtml, 1, nPos);
|
|
nPos := Pos('>', sHtml);
|
|
if nPos > 0 then
|
|
begin
|
|
SetLength(sHtml, nPos - 1);
|
|
// Delete(sHtml, nPos, Length(sHtml) - nPos);
|
|
nLen := Length(sHtml);
|
|
if sHtml[nLen] = '/' then
|
|
SetLength(sHtml, nLen - 1);
|
|
|
|
SplitString(sHtml, ' ', AttrList);
|
|
|
|
for i := AttrList.Count - 1 downto 0 do
|
|
if Pos('=', AttrList[i]) = 0 then
|
|
AttrList.Delete(i);
|
|
|
|
AttrList.Text := StringReplace(AttrList.Text, '"', '', [rfReplaceAll]);
|
|
AttrList.Text := StringReplace(AttrList.Text, '''', '', [rfReplaceAll]);
|
|
Result := AttrList.Count;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
{ TTgHtmlParser }
|
|
|
|
Constructor TTgHtmlParser.Create;
|
|
begin
|
|
Inherited Create;
|
|
sHtml_ := '';
|
|
end;
|
|
|
|
procedure TTgHtmlParser.SetHtmlStr(const sHtml: String);
|
|
begin
|
|
if sHtml_ <> sHtml then
|
|
sHtml_ := sHtml;
|
|
end;
|
|
|
|
function TTgHtmlParser.GetTagData(sTagPath: String): String;
|
|
var
|
|
TagPaths: TStringList;
|
|
i, b, e: Integer;
|
|
sHtml: String;
|
|
begin
|
|
Result := '';
|
|
sHtml := sHtml_;
|
|
Guard(TagPaths, TStringList.Create);
|
|
SplitString(sTagpath, '/', TagPaths);
|
|
for i := 0 to TagPaths.Count - 1 do
|
|
sHtml := ExtractElementOnce(TagPaths[i], sHtml);
|
|
|
|
Result := sHtml;
|
|
end;
|
|
|
|
function TTgHtmlParser.GetTagText(sTagPath: String): String;
|
|
var
|
|
b, e: Integer;
|
|
sAttr: String;
|
|
AttrList: TStringList;
|
|
i: Integer;
|
|
begin
|
|
Result := '';
|
|
try
|
|
b := LastDelimiter(':', sTagPath);
|
|
if b > 0 then
|
|
begin
|
|
e := Length(sTagPath);
|
|
sAttr := Copy(sTagPath, b + 1, e - b);
|
|
Delete(sTagPath, b, e - b + 1);
|
|
end;
|
|
|
|
Result := GetTagData(sTagpath);
|
|
if Result <> '' then
|
|
begin
|
|
b := Pos('>', Result);
|
|
e := LastDelimiter('<', Result);
|
|
if b < e then
|
|
begin
|
|
if sAttr <> '' then
|
|
begin
|
|
Guard(AttrList, TStringList.Create);
|
|
if ExtractAttrToStrings(Result, AttrList) > 0 then
|
|
begin
|
|
Result := '';
|
|
for i := 0 to AttrList.Count - 1 do
|
|
if Pos(sAttr + '=', AttrList[i]) = 1 then
|
|
begin
|
|
Result := Copy(AttrList[i], Length(sAttr) + 2, Length(AttrList[i]) - Length(sAttr) + 1);
|
|
exit;
|
|
end;
|
|
end else Result := '';
|
|
end else begin
|
|
Result := Copy(Result, b + 1, e - b - 1);
|
|
Result := ClearElement('<', '>', Result)
|
|
end;
|
|
|
|
end else Result := '';
|
|
end;
|
|
finally
|
|
if Result <> '' then
|
|
begin
|
|
Result := StringReplace(Result, #13, ' ', [rfReplaceAll]);
|
|
Result := StringReplace(Result, #10, '', [rfReplaceAll]);
|
|
Result := Trim(Result);
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
// By - https://stackoverflow.com/questions/1657105/delphi-html-decode
|
|
function HtmlDecode(const AStr: String): String;
|
|
var
|
|
Sp, Rp, Cp, Tp: PChar;
|
|
S: String;
|
|
I, Code: Integer;
|
|
begin
|
|
SetLength(Result, Length(AStr));
|
|
Sp := PChar(AStr);
|
|
Rp := PChar(Result);
|
|
Cp := Sp;
|
|
try
|
|
while Sp^ <> #0 do
|
|
begin
|
|
case Sp^ of
|
|
'&': begin
|
|
Cp := Sp;
|
|
Inc(Sp);
|
|
case Sp^ of
|
|
'a': if AnsiStrPos(Sp, 'amp;') = Sp then { do not localize }
|
|
begin
|
|
Inc(Sp, 3);
|
|
Rp^ := '&';
|
|
end;
|
|
'l',
|
|
'g': if (AnsiStrPos(Sp, 'lt;') = Sp) or (AnsiStrPos(Sp, 'gt;') = Sp) then { do not localize }
|
|
begin
|
|
Cp := Sp;
|
|
Inc(Sp, 2);
|
|
while (Sp^ <> ';') and (Sp^ <> #0) do
|
|
Inc(Sp);
|
|
if Cp^ = 'l' then
|
|
Rp^ := '<'
|
|
else
|
|
Rp^ := '>';
|
|
end;
|
|
'n': if AnsiStrPos(Sp, 'nbsp;') = Sp then { do not localize }
|
|
begin
|
|
Inc(Sp, 4);
|
|
Rp^ := ' ';
|
|
end;
|
|
'q': if AnsiStrPos(Sp, 'quot;') = Sp then { do not localize }
|
|
begin
|
|
Inc(Sp,4);
|
|
Rp^ := '"';
|
|
end;
|
|
'#': begin
|
|
Tp := Sp;
|
|
Inc(Tp);
|
|
while (Sp^ <> ';') and (Sp^ <> #0) do
|
|
Inc(Sp);
|
|
SetString(S, Tp, Sp - Tp);
|
|
Val(S, I, Code);
|
|
Rp^ := Chr((I));
|
|
end;
|
|
else
|
|
Exit;
|
|
end;
|
|
end
|
|
else
|
|
Rp^ := Sp^;
|
|
end;
|
|
Inc(Rp);
|
|
Inc(Sp);
|
|
end;
|
|
except
|
|
end;
|
|
SetLength(Result, Rp - PChar(Result));
|
|
end;
|
|
|
|
// By - https://stackoverflow.com/questions/3001443/get-the-rendered-text-from-html-delphi
|
|
function StripHTMLTags(const strHTML: string): string;
|
|
var
|
|
P: PChar;
|
|
InTag: Boolean;
|
|
i, intResultLength: Integer;
|
|
begin
|
|
P := PChar(strHTML);
|
|
Result := '';
|
|
|
|
InTag := False;
|
|
repeat
|
|
case P^ of
|
|
'<': InTag := True;
|
|
'>': InTag := False;
|
|
#13, #10: ; {do nothing}
|
|
else
|
|
if not InTag then
|
|
begin
|
|
if (P^ in [#9, #32]) and ((P+1)^ in [#10, #13, #32, #9, '<']) then
|
|
else
|
|
Result := Result + P^;
|
|
end;
|
|
end;
|
|
Inc(P);
|
|
until (P^ = #0);
|
|
|
|
{convert system characters}
|
|
Result := StringReplace(Result, '"', '"', [rfReplaceAll]);
|
|
Result := StringReplace(Result, ''', '''', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '>', '>', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '<', '<', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '&', '&', [rfReplaceAll]);
|
|
Result := StringReplace(Result, ' ', ' ', [rfReplaceAll]);
|
|
{here you may add another symbols from RFC if you need}
|
|
end;
|
|
|
|
function ConvHtmlEntities(sText: String): String;
|
|
begin
|
|
Result := StringReplace(sText, #9, '', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '<', '<', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '>', '>', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '&', '&', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '"', '"', [rfReplaceAll]);
|
|
Result := StringReplace(Result, ' ', ' ', [rfReplaceAll]);
|
|
Result := StringReplace(Result, ''', '''', [rfReplaceAll]);
|
|
Result := StringReplace(Result, '·', '·', [rfReplaceAll]);
|
|
end;
|
|
|
|
end.
|