BSOne.SFC/Tocsg.Lib/VCL/Tocsg.Html.pas

504 lines
13 KiB
Plaintext

{*******************************************************}
{ }
{ Tocsg.Html }
{ }
{ Copyright (C) 2020 kku }
{ }
{*******************************************************}
unit Tocsg.Html;
interface
uses
Tocsg.Obj, System.SysUtils, System.Classes;
type
TTgHtmlParser = class(TTgObject)
private
sHtml_: String;
procedure SetHtmlStr(const sHtml: String);
function GetTagData(sTagPath: String): String;
function GetTagText(sTagPath: String): String;
public
Constructor Create;
property Html: String read sHtml_ write SetHtmlStr;
property Text[sTagpath: String]: String read GetTagText; default;
end;
function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer;
function ExtractElement(sTag, sHtml: String): String;
function ExtractElementOnce(sTag, sHtml: String): String;
function ClearElement(sTag, sHtml: String): String; overload;
function ClearElement(sBTag, sETag, sHtml: String): String; overload;
function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer;
// 셋중에 하나로 정리 하자.... 19_1205 kku
function HtmlDecode(const AStr: String): String;
function StripHTMLTags(const strHTML: string): string;
function ConvHtmlEntities(sText: String): String;
implementation
uses
EM.DomParser, Tocsg.Strings, Tocsg.Safe;
function ExtractElementToStrings(sTag, sHtml: String; EltList: TStrings): Integer;
var
nPosB, nPosE, nPosE_Ck: Integer;
nBEltC, nEEltC: Integer;
sBTag, sETag, sCheckHtml: String;
begin
EltList.Clear;
sBTag := '<' + sTag;
sETag := '</' + sTag + '>';
Result := 0;
nBEltC := Length(sBTag);
while Length(sHtml) > 0 do
begin
nEEltC := Length(sETag);
nPosB := Pos(sBTag, sHtml);
if nPosB = 0 then
break;
Delete(sHtml, 1, nPosB - 1);
if (sHtml[nBEltC+1] <> ' ') and (sHtml[nBEltC+1] <> '>') then
begin
Delete(sHtml, 1, nBEltC);
continue;
end;
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
sCheckHtml := Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1);
nPosB := Pos(sBTag, sCheckHtml);
nPosE := Pos(sETag, sHtml);
// nPosE_Ck := Pos('<', sCheckHtml, nPosB + 1);
// if nPosE > nPosE_Ck then
// begin
// nPosE := nPosE_Ck;
// nEEltc := 0;
// end else
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
begin
// /> 이렇게 끝날수도 있다.
nPosE_Ck := Pos('/>', sHtml);
if nPosE > nPosE_Ck then
begin
nPosE := nPosE_Ck;
nEEltC := 2;
end;
end;
if nPosE <> 0 then
begin
EltList.Add(Copy(sHtml, 1, nPosE + nEEltC - 1));
Delete(sHtml, 1, nPosE + nEEltC - 1);
end else break;
end;
Result := EltList.Count;
end;
function ExtractElement(sTag, sHtml: String): String;
var
nPosB, nPosE: Integer;
nBEltC, nEEltC: Integer;
sBTag, sETag: String;
begin
Result := '';
sBTag := '<' + sTag;
sETag := '</' + sTag + '>';
nBEltC := Length(sBTag);
while Length(sHtml) > 0 do
begin
nEEltC := Length(sETag);
nPosB := Pos(sBTag, sHtml);
if nPosB = 0 then
break;
Delete(sHtml, 1, nPosB - 1);
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
nPosE := Pos(sETag, sHtml);
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
begin
// /> 이렇게 끝날수도 있다.
nPosE := Pos('/>', sHtml);
nEEltC := 2;
end;
if nPosE <> 0 then
begin
Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1);
Delete(sHtml, 1, nPosE + nEEltC - 1);
end else break;
end;
end;
function ClearElement(sTag, sHtml: String): String;
var
nPosB, nPosE: Integer;
nBEltC, nEEltC: Integer;
sBTag, sETag: String;
begin
Result := '';
sBTag := '<' + sTag;
sETag := '</' + sTag + '>';
nBEltC := Length(sBTag);
while Length(sHtml) > 0 do
begin
nEEltC := Length(sETag);
nPosB := Pos(sBTag, sHtml);
if nPosB = 0 then
break;
Result := Result + Copy(sHtml, 1, nPosB - 1);
Delete(sHtml, 1, nPosB + nBEltC - 1);
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
nPosE := Pos(sETag, sHtml);
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
begin
// /> 이렇게 끝날수도 있다.
nPosE := Pos('/>', sHtml);
nEEltC := 2;
end;
if nPosE = 0 then
break
else
Delete(sHtml, 1, nPosE + nEEltC - 1);
end;
Result := Result + sHtml;
end;
function ClearElement(sBTag, sETag, sHtml: String): String;
var
nPosB, nPosE: Integer;
nBEltC, nEEltC: Integer;
begin
Result := '';
nBEltC := Length(sBTag);
while Length(sHtml) > 0 do
begin
nEEltC := Length(sETag);
nPosB := Pos(sBTag, sHtml);
if nPosB = 0 then
break;
Result := Result + Copy(sHtml, 1, nPosB - 1);
Delete(sHtml, 1, nPosB + nBEltC - 1);
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
nPosE := Pos(sETag, sHtml);
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
begin
// /> 이렇게 끝날수도 있다.
nPosE := Pos('/>', sHtml);
nEEltC := 2;
end;
if nPosE = 0 then
break
else
Delete(sHtml, 1, nPosE + nEEltC - 1);
end;
Result := Result + sHtml;
end;
function ExtractElementOnce(sTag, sHtml: String): String;
var
nPosB, nPosE: Integer;
nBEltC, nEEltC: Integer;
sBTag, sETag: String;
begin
Result := '';
sBTag := '<' + sTag;
sETag := '</' + sTag + '>';
nBEltC := Length(sBTag);
while Length(sHtml) > 0 do
begin
nEEltC := Length(sETag);
nPosB := Pos(sBTag, sHtml);
if nPosB = 0 then
break;
Delete(sHtml, 1, nPosB - 1);
// nPosB := Pos(sBTag, PChar(@sHtml[nBEltC + 1]));
nPosB := Pos(sBTag, Copy(sHtml, nBEltC + 1, Length(sHtml) - nBEltC + 1));
nPosE := Pos(sETag, sHtml);
if (nPosE = 0) or ((nPosB > 0) and (nPosB < nPosE)) then
begin
// /> 이렇게 끝날수도 있다.
nPosE := Pos('/>', sHtml);
nEEltC := 2;
end;
if nPosE <> 0 then
begin
Result := Result + Copy(sHtml, 1, nPosE + nEEltC - 1);
exit;
// Delete(sHtml, 1, nPosE + nEEltC - 1);
end else break;
end;
end;
function ExtractAttrToStrings(sHtml: String; AttrList: TStrings): Integer;
var
sBTag: String;
i, nPos, nLen: Integer;
begin
Result := 0;
AttrList.Clear;
nPos := Pos('<', sHtml);
if nPos > 0 then
begin
Delete(sHtml, 1, nPos);
nPos := Pos('>', sHtml);
if nPos > 0 then
begin
SetLength(sHtml, nPos - 1);
// Delete(sHtml, nPos, Length(sHtml) - nPos);
nLen := Length(sHtml);
if sHtml[nLen] = '/' then
SetLength(sHtml, nLen - 1);
SplitString(sHtml, ' ', AttrList);
for i := AttrList.Count - 1 downto 0 do
if Pos('=', AttrList[i]) = 0 then
AttrList.Delete(i);
AttrList.Text := StringReplace(AttrList.Text, '"', '', [rfReplaceAll]);
AttrList.Text := StringReplace(AttrList.Text, '''', '', [rfReplaceAll]);
Result := AttrList.Count;
end;
end;
end;
{ TTgHtmlParser }
Constructor TTgHtmlParser.Create;
begin
Inherited Create;
sHtml_ := '';
end;
procedure TTgHtmlParser.SetHtmlStr(const sHtml: String);
begin
if sHtml_ <> sHtml then
sHtml_ := sHtml;
end;
function TTgHtmlParser.GetTagData(sTagPath: String): String;
var
TagPaths: TStringList;
i, b, e: Integer;
sHtml: String;
begin
Result := '';
sHtml := sHtml_;
Guard(TagPaths, TStringList.Create);
SplitString(sTagpath, '/', TagPaths);
for i := 0 to TagPaths.Count - 1 do
sHtml := ExtractElementOnce(TagPaths[i], sHtml);
Result := sHtml;
end;
function TTgHtmlParser.GetTagText(sTagPath: String): String;
var
b, e: Integer;
sAttr: String;
AttrList: TStringList;
i: Integer;
begin
Result := '';
try
b := LastDelimiter(':', sTagPath);
if b > 0 then
begin
e := Length(sTagPath);
sAttr := Copy(sTagPath, b + 1, e - b);
Delete(sTagPath, b, e - b + 1);
end;
Result := GetTagData(sTagpath);
if Result <> '' then
begin
b := Pos('>', Result);
e := LastDelimiter('<', Result);
if b < e then
begin
if sAttr <> '' then
begin
Guard(AttrList, TStringList.Create);
if ExtractAttrToStrings(Result, AttrList) > 0 then
begin
Result := '';
for i := 0 to AttrList.Count - 1 do
if Pos(sAttr + '=', AttrList[i]) = 1 then
begin
Result := Copy(AttrList[i], Length(sAttr) + 2, Length(AttrList[i]) - Length(sAttr) + 1);
exit;
end;
end else Result := '';
end else begin
Result := Copy(Result, b + 1, e - b - 1);
Result := ClearElement('<', '>', Result)
end;
end else Result := '';
end;
finally
if Result <> '' then
begin
Result := StringReplace(Result, #13, ' ', [rfReplaceAll]);
Result := StringReplace(Result, #10, '', [rfReplaceAll]);
Result := Trim(Result);
end;
end;
end;
// By - https://stackoverflow.com/questions/1657105/delphi-html-decode
function HtmlDecode(const AStr: String): String;
var
Sp, Rp, Cp, Tp: PChar;
S: String;
I, Code: Integer;
begin
SetLength(Result, Length(AStr));
Sp := PChar(AStr);
Rp := PChar(Result);
Cp := Sp;
try
while Sp^ <> #0 do
begin
case Sp^ of
'&': begin
Cp := Sp;
Inc(Sp);
case Sp^ of
'a': if AnsiStrPos(Sp, 'amp;') = Sp then { do not localize }
begin
Inc(Sp, 3);
Rp^ := '&';
end;
'l',
'g': if (AnsiStrPos(Sp, 'lt;') = Sp) or (AnsiStrPos(Sp, 'gt;') = Sp) then { do not localize }
begin
Cp := Sp;
Inc(Sp, 2);
while (Sp^ <> ';') and (Sp^ <> #0) do
Inc(Sp);
if Cp^ = 'l' then
Rp^ := '<'
else
Rp^ := '>';
end;
'n': if AnsiStrPos(Sp, 'nbsp;') = Sp then { do not localize }
begin
Inc(Sp, 4);
Rp^ := ' ';
end;
'q': if AnsiStrPos(Sp, 'quot;') = Sp then { do not localize }
begin
Inc(Sp,4);
Rp^ := '"';
end;
'#': begin
Tp := Sp;
Inc(Tp);
while (Sp^ <> ';') and (Sp^ <> #0) do
Inc(Sp);
SetString(S, Tp, Sp - Tp);
Val(S, I, Code);
Rp^ := Chr((I));
end;
else
Exit;
end;
end
else
Rp^ := Sp^;
end;
Inc(Rp);
Inc(Sp);
end;
except
end;
SetLength(Result, Rp - PChar(Result));
end;
// By - https://stackoverflow.com/questions/3001443/get-the-rendered-text-from-html-delphi
function StripHTMLTags(const strHTML: string): string;
var
P: PChar;
InTag: Boolean;
i, intResultLength: Integer;
begin
P := PChar(strHTML);
Result := '';
InTag := False;
repeat
case P^ of
'<': InTag := True;
'>': InTag := False;
#13, #10: ; {do nothing}
else
if not InTag then
begin
if (P^ in [#9, #32]) and ((P+1)^ in [#10, #13, #32, #9, '<']) then
else
Result := Result + P^;
end;
end;
Inc(P);
until (P^ = #0);
{convert system characters}
Result := StringReplace(Result, '&quot;', '"', [rfReplaceAll]);
Result := StringReplace(Result, '&apos;', '''', [rfReplaceAll]);
Result := StringReplace(Result, '&gt;', '>', [rfReplaceAll]);
Result := StringReplace(Result, '&lt;', '<', [rfReplaceAll]);
Result := StringReplace(Result, '&amp;', '&', [rfReplaceAll]);
Result := StringReplace(Result, '&nbsp;', ' ', [rfReplaceAll]);
{here you may add another symbols from RFC if you need}
end;
function ConvHtmlEntities(sText: String): String;
begin
Result := StringReplace(sText, #9, '', [rfReplaceAll]);
Result := StringReplace(Result, '&lt;', '<', [rfReplaceAll]);
Result := StringReplace(Result, '&gt;', '>', [rfReplaceAll]);
Result := StringReplace(Result, '&amp;', '&', [rfReplaceAll]);
Result := StringReplace(Result, '&quot;', '"', [rfReplaceAll]);
Result := StringReplace(Result, '&nbsp;', ' ', [rfReplaceAll]);
Result := StringReplace(Result, '&#039;', '''', [rfReplaceAll]);
Result := StringReplace(Result, '&middot;', '·', [rfReplaceAll]);
end;
end.