德尔福HTML解码

我使用的是 Delphi 2009，希望解码 HTML 编码的字符串，例如：

&#39; -> '

但我找不到任何内置函数可以做到这一点。

在此先感谢

来源

2009-11-01 tekBlues

您使用的是什么HTML解析库？它有没有这样的功能？ – 2009-11-01 13:53:13

嗨，Rob，我使用 WinInet 从网站获取数据。 – tekBlues 2009-11-01 14:42:11

这不是我问的。从 Web 服务器下载资源之后，你用什么库来解析它？你用什么把字符序列变成有意义的内容？（如果你什么库都没用，为什么不用呢？） – 2009-11-01 15:43:31

看看 HTTPApp 单元，里面有 HTTPDecode 和 HTMLDecode（以及对应的 Encode 函数）。你应该能在 Source/Win32/Internet 文件夹中找到它。

来源

2009-11-01 22:09:54 Gerard

-4

这是一个原型函数，没有做任何有效性检查，仅用于说明思路。

{ Decodes ONE decimal numeric character reference of the form '&#NNN;'
  (for example '&#39;') and returns the character it denotes.

  AStr must consist of exactly one such reference. This is a prototype with
  no validation: if the digits are missing or malformed, StrToInt raises
  EConvertError. }
function HtmlDecode(const AStr: string) : char;
var
  iPos : integer;
begin
  iPos := AnsiPos(';', AStr);
  // The digits start at index 3 (right after '&#') and end just before the
  // ';' found at iPos, so there are iPos - 3 of them. Copy's third argument
  // is a COUNT, not an end index; passing iPos itself would include the ';'
  // and make StrToInt fail on every well-formed input.
  Result := Chr(StrToInt(Copy(AStr, 3, iPos - 3)));
end;

再见！

来源

2009-11-02 02:17:04 Olaf

这是我的 HTMLDecode 函数（在 CodeGear 的 HTTPApp 单元基础上略作修改）：

{ Decodes a small set of HTML character references in AStr and returns the
  decoded text.

  Recognised references:
    &amp;  -> '&'      &lt;   -> '<'      &gt;   -> '>'
    &quot; -> '"'      &nbsp; -> ' ' (plain space)
    &#NNN; -> the character with that code (parsed by Val)

  Anything else that starts with '&' (unknown entity names, unterminated or
  out-of-range numeric references) is copied to the output unchanged, so no
  input text is ever lost and the function never raises on malformed input.

  Every replacement is shorter than the text it replaces, so a buffer the
  size of the input is always large enough; it is trimmed at the end. }
function HTMLDecode(const AStr: String): String;
var
  Sp, Rp, Dp, Tp: PChar;
  S: String;
  I, Code: Integer;

  // True when the zero-terminated text at P begins with Token (case-sensitive).
  function StartsWith(P: PChar; const Token: String): Boolean;
  begin
    Result := StrLComp(P, PChar(Token), Length(Token)) = 0;
  end;

begin
  SetLength(Result, Length(AStr));
  Sp := PChar(AStr);    // read cursor
  Rp := PChar(Result);  // write cursor
  while Sp^ <> #0 do
  begin
    // In each entity branch Sp is moved onto the terminating ';' so that the
    // shared Inc(Sp) at the bottom of the loop steps past the whole reference.
    if Sp^ <> '&' then
      Rp^ := Sp^
    else if StartsWith(Sp, '&amp;') then { do not localize }
    begin
      Rp^ := '&';
      Inc(Sp, 4);
    end
    else if StartsWith(Sp, '&lt;') then { do not localize }
    begin
      Rp^ := '<';
      Inc(Sp, 3);
    end
    else if StartsWith(Sp, '&gt;') then { do not localize }
    begin
      Rp^ := '>';
      Inc(Sp, 3);
    end
    else if StartsWith(Sp, '&nbsp;') then { do not localize }
    begin
      Rp^ := ' ';
      Inc(Sp, 5);
    end
    else if StartsWith(Sp, '&quot;') then { do not localize }
    begin
      Rp^ := '"';
      Inc(Sp, 5);
    end
    else if Sp[1] = '#' then
    begin
      // Numeric reference: collect everything between '&#' and ';'.
      Dp := Sp + 2;
      Tp := Dp;
      while (Tp^ <> ';') and (Tp^ <> #0) do
        Inc(Tp);
      SetString(S, Dp, Tp - Dp);
      Val(S, I, Code);
      // Accept only a terminated reference whose number parsed cleanly and
      // fits in a single Char; otherwise fall back to copying '&' literally
      // (the rest of the text is then copied by the following iterations).
      if (Tp^ = ';') and (S <> '') and (Code = 0) and
         (I > 0) and (I <= Ord(High(Char))) then
      begin
        Rp^ := Chr(I);
        Sp := Tp;
      end
      else
        Rp^ := '&';
    end
    else
      // Unknown entity name: keep the ampersand and carry on.
      Rp^ := '&';

    Inc(Rp);
    Inc(Sp);
  end;
  SetLength(Result, Rp - PChar(Result));
end;

来源

2009-11-24 18:48:55 invarbrass

你修改了什么，为什么？ – dummzeuch 2010-11-07 18:27:03

随 Delphi 提供的 HTTPApp 单元中的同名函数不处理 &#<数字>; 形式的标记。这段代码以它为基础，但能够处理这种标记。 – MarkAurelius 2015-10-28 03:10:34

HttpApp.HttpDecode 函数不会解码 HTML 实体（https://www.w3.org/TR/html4/sgml/entities.html#sym）

例如：&there4; → ∴

function HtmlDecode(s: UnicodeString): UnicodeString; 
{ 
    Public domain: No attribution required 
    Known issue: it doesn't handle entities with character code points above $FFFF (65535), 
    e.g.: &#x1F600; 

    That's because UTF-16 requires 2 code units (a surrogate pair) to encode one such character. 
} 

    function UCS4CharToString(uch: UCS4Char): UnicodeString; 
    var 
     buffer: UCS4String; 
    begin 
     // Build a two-element UCS4 string: the code point followed by the 
     // zero terminator, then hand it to the RTL for conversion. 
     SetLength(buffer, 2); 
     buffer[High(buffer)] := 0; 
     buffer[Low(buffer)] := uch; 
     Result := UCS4StringToUnicodeString(buffer); 
    end; 

    function GetCharRef(sValue: UnicodeString; StartIndex: Integer; out CharRef: string): UnicodeString; 
    { 
      Tries to read a numeric character reference that begins at 
      sValue[StartIndex]. 

      Character references come in either decimal or hex forms: 

        &#9830;    //decimal 
        &#x2666;   //hexadecimal (the 'x' may be upper or lower case) 

        CharRef ::= '&#' [0-9]+ ';' 
                  | '&#x' [0-9a-fA-F]+ ';' 

      On success: returns the decoded character(s) and sets CharRef to the 
      full matched text including '&' and ';', so the caller can skip it. 

      On failure: returns '' and sets CharRef to ''. Failure covers text that 
      is not a reference at all, a reference with no digits ('&#;', '&#x;'), 
      a missing ';', a value that overflows an Integer, zero, a surrogate 
      code point, or a value above $10FFFF. The function never raises for 
      malformed input. 
    } 
    var 
     i: Integer; 
     len: Integer; 
     digitsStart: Integer; 
     digits: string; 
     isHex: Boolean; 
     code: Integer; 
    begin 
     Result := ''; 
     CharRef := ''; 

     // The shortest possible reference is 4 characters long, e.g. '&#1;'. 
     len := Length(sValue) - StartIndex + 1; 
     if len < 4 then 
      Exit; 
     i := StartIndex; 
     if sValue[i] <> '&' then Exit; 
     Inc(i); 
     if sValue[i] <> '#' then Exit; 
     Inc(i); 

     isHex := (sValue[i] = 'x') or (sValue[i] = 'X'); 
     if isHex then 
      Inc(i); //skip the x 

     // Scan the run of digits; the length guard keeps the index in range 
     // when the input ends before a ';' is seen. 
     digitsStart := i; 
     if isHex then 
     begin 
      while (i <= Length(sValue)) and CharInSet(sValue[i], ['0'..'9', 'a'..'f', 'A'..'F']) do 
       Inc(i); 
     end 
     else 
     begin 
      while (i <= Length(sValue)) and CharInSet(sValue[i], ['0'..'9']) do 
       Inc(i); 
     end; 

     if i > Length(sValue) then 
      Exit; //ran off the end without finding ';' 
     if sValue[i] <> ';' then 
      Exit; 
     if i = digitsStart then 
      Exit; //no digits at all, e.g. '&#;' or '&#x;' 

     digits := Copy(sValue, digitsStart, i - digitsStart); 
     if isHex then 
      digits := '$' + digits; 

     // TryStrToInt instead of StrToInt: overflowing values must be reported 
     // as "not a reference" rather than raising EConvertError. 
     if not TryStrToInt(digits, code) then 
      Exit; 
     if (code <= 0) or (code > $10FFFF) then 
      Exit; //outside the Unicode code space (or wrapped negative) 
     if (code >= $D800) and (code <= $DFFF) then 
      Exit; //surrogate code points are not characters 

     CharRef := Copy(sValue, StartIndex, (i-StartIndex)+1); 
     Result := UCS4CharToString(UCS4Char(code)); 
    end; 

    function GetEntityRef(sValue: string; StartIndex: Integer; out CharRef: string): UnicodeString; 

     function IsNameStartChar(ch: WideChar): Boolean; 
     begin 
      { 
       Reports whether ch may begin an entity name: 

       NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] 

       The final range [#x10000-#xEFFFF] cannot be tested here: a WideChar is 
       a single 16-bit UTF-16 code unit and can never hold a value above 
       $FFFF. Such characters arrive as surrogate pairs, whose halves 
       ($D800..$DFFF) are not in any range below, so they are rejected. 
       Supporting them would require scanning the string as UCS4. 
      } 
      case ch of 
      ':', 'A'..'Z', '_', 'a'..'z', #$C0..#$D6, #$D8..#$F6, #$F8..#$FF: Result := True; 
      #$100..#$2FF, #$370..#$37D, #$37F..#$FFF: Result := True; 
      #$1000..#$1FFF, #$200C..#$200D, #$2070..#$218F, #$2C00..#$2FEF, #$3001..#$D7FF, #$F900..#$FDCF, #$FDF0..#$FFFD: Result := True; 
      else 
       Result := False; 
      end; 
     end; 

     function IsNameChar(ch: WideChar): Boolean; 
     begin 
      // A name character is any name-start character, plus the extra 
      // characters that are only allowed after the first position. 
      Result := IsNameStartChar(ch); 
      if not Result then 
       case ch of 
       '-', '.', '0'..'9', #$B7, #$0300..#$036F, #$203F..#$2040: Result := True; 
       end; 
     end; 

     type 
      THtmlEntity = record 
       entity: string; 
       ch: UCS4Char; 
      end; 
     const 
      //https://www.w3.org/TR/html4/sgml/entities.html#sym 
      //html entities are case sensitive (e.g. "larr" is different from "lArr") 
      HtmlEntities: array[0..252] of THtmlEntity = (
       (entity: 'apos';  ch: 39;), // apostrophe (originally only existed in xml, and not in HTML. Was added to HTML5 
       (entity: 'quot';  ch: 34;), // quotation mark = APL quote, U+0022 
       (entity: 'amp';  ch: 38;), // ampersand, U+0026 
       (entity: 'lt';   ch: 60;), // less-than sign, U+003C 
       (entity: 'gt';   ch: 62;), // greater-than sign, U+003E 
       (entity: 'OElig';  ch: 338; ), // latin capital ligature OE, U+0152 
       (entity: 'oelig';  ch: 339; ), // latin small ligature oe, U+0153 
       (entity: 'Scaron'; ch: 352; ), // latin capital letter S with caron, U+0160 
       (entity: 'scaron'; ch: 353; ), // latin small letter s with caron, U+0161 
       (entity: 'Yuml';  ch: 376; ), // latin capital letter Y with diaeresis, U+0178 
       (entity: 'circ';  ch: 710; ), // modifier letter circumflex accent, U+02C6 
       (entity: 'tilde';  ch: 732; ), // small tilde, U+02DC 
       (entity: 'nbsp';  ch: 160; ), // no-break space = non-breaking space, U+00A0 
       (entity: 'iexcl';  ch: 161; ), // inverted exclamation mark, U+00A1 
       (entity: 'cent';  ch: 162; ), // cent sign, U+00A2 
       (entity: 'pound';  ch: 163; ), // pound sign, U+00A3 
       (entity: 'curren'; ch: 164; ), // currency sign, U+00A4 
       (entity: 'yen';  ch: 165; ), // yen sign = yuan sign, U+00A5 
       (entity: 'brvbar'; ch: 166; ), // broken bar = broken vertical bar, U+00A6 
       (entity: 'sect';  ch: 167; ), // section sign, U+00A7 
       (entity: 'uml';  ch: 168; ), // diaeresis = spacing diaeresis, U+00A8 
       (entity: 'copy';  ch: 169; ), // copyright sign, U+00A9 
       (entity: 'ordf';  ch: 170; ), // feminine ordinal indicator, U+00AA 
       (entity: 'laquo';  ch: 171; ), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB 
       (entity: 'not';  ch: 172; ), // not sign, U+00AC 
       (entity: 'shy';  ch: 173; ), // soft hyphen = discretionary hyphen, U+00AD 
       (entity: 'reg';  ch: 174; ), // registered sign = registered trade mark sign, U+00AE 
       (entity: 'macr';  ch: 175; ), // macron = spacing macron = overline = APL overbar, U+00AF 
       (entity: 'deg';  ch: 176; ), // degree sign, U+00B0 
       (entity: 'plusmn'; ch: 177; ), // plus-minus sign = plus-or-minus sign, U+00B1 
       (entity: 'sup2';  ch: 178; ), // superscript two = superscript digit two = squared, U+00B2 
       (entity: 'sup3';  ch: 179; ), // superscript three = superscript digit three = cubed, U+00B3 
       (entity: 'acute';  ch: 180; ), // acute accent = spacing acute, U+00B4 
       (entity: 'micro';  ch: 181; ), // micro sign, U+00B5 
       (entity: 'para';  ch: 182; ), // pilcrow sign = paragraph sign, U+00B6 
       (entity: 'middot'; ch: 183; ), // middle dot = Georgian comma = Greek middle dot, U+00B7 
       (entity: 'cedil';  ch: 184; ), // cedilla = spacing cedilla, U+00B8 
       (entity: 'sup1';  ch: 185; ), // superscript one = superscript digit one, U+00B9 
       (entity: 'ordm';  ch: 186; ), // masculine ordinal indicator, U+00BA 
       (entity: 'raquo';  ch: 187; ), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB 
       (entity: 'frac14'; ch: 188; ), // vulgar fraction one quarter = fraction one quarter, U+00BC 
       (entity: 'frac12'; ch: 189; ), // vulgar fraction one half = fraction one half, U+00BD 
       (entity: 'frac34'; ch: 190; ), // vulgar fraction three quarters = fraction three quarters, U+00BE 
       (entity: 'iquest'; ch: 191; ), // inverted question mark = turned question mark, U+00BF 
       (entity: 'Agrave'; ch: 192; ), // latin capital letter A with grave = latin capital letter A grave, U+00C0 
       (entity: 'Aacute'; ch: 193; ), // latin capital letter A with acute, U+00C1 
       (entity: 'Acirc';  ch: 194; ), // latin capital letter A with circumflex, U+00C2 
       (entity: 'Atilde'; ch: 195; ), // latin capital letter A with tilde, U+00C3 
       (entity: 'Auml';  ch: 196; ), // latin capital letter A with diaeresis, U+00C4 
       (entity: 'Aring';  ch: 197; ), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 
       (entity: 'AElig';  ch: 198; ), // latin capital letter AE = latin capital ligature AE, U+00C6 
       (entity: 'Ccedil'; ch: 199; ), // latin capital letter C with cedilla, U+00C7 
       (entity: 'Egrave'; ch: 200; ), // latin capital letter E with grave, U+00C8 
       (entity: 'Eacute'; ch: 201; ), // latin capital letter E with acute, U+00C9 
       (entity: 'Ecirc';  ch: 202; ), // latin capital letter E with circumflex, U+00CA 
       (entity: 'Euml';  ch: 203; ), // latin capital letter E with diaeresis, U+00CB 
       (entity: 'Igrave'; ch: 204; ), // latin capital letter I with grave, U+00CC 
       (entity: 'Iacute'; ch: 205; ), // latin capital letter I with acute, U+00CD 
       (entity: 'Icirc';  ch: 206; ), // latin capital letter I with circumflex, U+00CE 
       (entity: 'Iuml';  ch: 207; ), // latin capital letter I with diaeresis, U+00CF 
       (entity: 'ETH';  ch: 208; ), // latin capital letter ETH, U+00D0 
       (entity: 'Ntilde'; ch: 209; ), // latin capital letter N with tilde, U+00D1 
       (entity: 'Ograve'; ch: 210; ), // latin capital letter O with grave, U+00D2 
       (entity: 'Oacute'; ch: 211; ), // latin capital letter O with acute, U+00D3 
       (entity: 'Ocirc';  ch: 212; ), // latin capital letter O with circumflex, U+00D4 
       (entity: 'Otilde'; ch: 213; ), // latin capital letter O with tilde, U+00D5 
       (entity: 'Ouml';  ch: 214; ), // latin capital letter O with diaeresis, U+00D6 
       (entity: 'times';  ch: 215; ), // multiplication sign, U+00D7 
       (entity: 'Oslash'; ch: 216; ), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 
       (entity: 'Ugrave'; ch: 217; ), // latin capital letter U with grave, U+00D9 
       (entity: 'Uacute'; ch: 218; ), // latin capital letter U with acute, U+00DA 
       (entity: 'Ucirc';  ch: 219; ), // latin capital letter U with circumflex, U+00DB 
       (entity: 'Uuml';  ch: 220; ), // latin capital letter U with diaeresis, U+00DC 
       (entity: 'Yacute'; ch: 221; ), // latin capital letter Y with acute, U+00DD 
       (entity: 'THORN';  ch: 222; ), // latin capital letter THORN, U+00DE 
       (entity: 'szlig';  ch: 223; ), // latin small letter sharp s = ess-zed, U+00DF 
       (entity: 'agrave'; ch: 224; ), // latin small letter a with grave = latin small letter a grave, U+00E0 
       (entity: 'aacute'; ch: 225; ), // latin small letter a with acute, U+00E1 
       (entity: 'acirc';  ch: 226; ), // latin small letter a with circumflex, U+00E2 
       (entity: 'atilde'; ch: 227; ), // latin small letter a with tilde, U+00E3 
       (entity: 'auml';  ch: 228; ), // latin small letter a with diaeresis, U+00E4 
       (entity: 'aring';  ch: 229; ), // latin small letter a with ring above = latin small letter a ring, U+00E5 
       (entity: 'aelig';  ch: 230; ), // latin small letter ae = latin small ligature ae, U+00E6 
       (entity: 'ccedil'; ch: 231; ), // latin small letter c with cedilla, U+00E7 
       (entity: 'egrave'; ch: 232; ), // latin small letter e with grave, U+00E8 
       (entity: 'eacute'; ch: 233; ), // latin small letter e with acute, U+00E9 
       (entity: 'ecirc';  ch: 234; ), // latin small letter e with circumflex, U+00EA 
       (entity: 'euml';  ch: 235; ), // latin small letter e with diaeresis, U+00EB 
       (entity: 'igrave'; ch: 236; ), // latin small letter i with grave, U+00EC 
       (entity: 'iacute'; ch: 237; ), // latin small letter i with acute, U+00ED 
       (entity: 'icirc';  ch: 238; ), // latin small letter i with circumflex, U+00EE 
       (entity: 'iuml';  ch: 239; ), // latin small letter i with diaeresis, U+00EF 
       (entity: 'eth';  ch: 240; ), // latin small letter eth, U+00F0 
       (entity: 'ntilde'; ch: 241; ), // latin small letter n with tilde, U+00F1 
       (entity: 'ograve'; ch: 242; ), // latin small letter o with grave, U+00F2 
       (entity: 'oacute'; ch: 243; ), // latin small letter o with acute, U+00F3 
       (entity: 'ocirc';  ch: 244; ), // latin small letter o with circumflex, U+00F4 
       (entity: 'otilde'; ch: 245; ), // latin small letter o with tilde, U+00F5 
       (entity: 'ouml';  ch: 246; ), // latin small letter o with diaeresis, U+00F6 
       (entity: 'divide'; ch: 247; ), // division sign, U+00F7 
       (entity: 'oslash'; ch: 248; ), // latin small letter o with stroke, = latin small letter o slash, U+00F8 
       (entity: 'ugrave'; ch: 249; ), // latin small letter u with grave, U+00F9 
       (entity: 'uacute'; ch: 250; ), // latin small letter u with acute, U+00FA 
       (entity: 'ucirc';  ch: 251; ), // latin small letter u with circumflex, U+00FB 
       (entity: 'uuml';  ch: 252; ), // latin small letter u with diaeresis, U+00FC 
       (entity: 'yacute'; ch: 253; ), // latin small letter y with acute, U+00FD 
       (entity: 'thorn';  ch: 254; ), // latin small letter thorn, U+00FE 
       (entity: 'yuml';  ch: 255; ), // latin small letter y with diaeresis, U+00FF 
       (entity: 'fnof';  ch: 402; ), // latin small f with hook = function = florin, U+0192 
       (entity: 'Alpha';  ch: 913; ), // greek capital letter alpha, U+0391 
       (entity: 'Beta';  ch: 914; ), // greek capital letter beta, U+0392 
       (entity: 'Gamma';  ch: 915; ), // greek capital letter gamma, U+0393 
       (entity: 'Delta';  ch: 916; ), // greek capital letter delta, U+0394 
       (entity: 'Epsilon'; ch: 917; ), // greek capital letter epsilon, U+0395 
       (entity: 'Zeta';  ch: 918; ), // greek capital letter zeta, U+0396 
       (entity: 'Eta';  ch: 919; ), // greek capital letter eta, U+0397 
       (entity: 'Theta';  ch: 920; ), // greek capital letter theta, U+0398 
       (entity: 'Iota';  ch: 921; ), // greek capital letter iota, U+0399 
       (entity: 'Kappa';  ch: 922; ), // greek capital letter kappa, U+039A 
       (entity: 'Lambda'; ch: 923; ), // greek capital letter lambda, U+039B 
       (entity: 'Mu';   ch: 924; ), // greek capital letter mu, U+039C 
       (entity: 'Nu';   ch: 925; ), // greek capital letter nu, U+039D 
       (entity: 'Xi';   ch: 926; ), // greek capital letter xi, U+039E 
       (entity: 'Omicron'; ch: 927; ), // greek capital letter omicron, U+039F 
       (entity: 'Pi';   ch: 928; ), // greek capital letter pi, U+03A0 
       (entity: 'Rho';  ch: 929; ), // greek capital letter rho, U+03A1 
       // there is no Sigmaf, and no U+03A2 character either 
       (entity: 'Sigma';  ch: 931; ), // greek capital letter sigma, U+03A3 
       (entity: 'Tau';  ch: 932; ), // greek capital letter tau, U+03A4 
       (entity: 'Upsilon'; ch: 933; ), // greek capital letter upsilon, U+03A5 
       (entity: 'Phi';  ch: 934; ), // greek capital letter phi, U+03A6 
       (entity: 'Chi';  ch: 935; ), // greek capital letter chi, U+03A7 
       (entity: 'Psi';  ch: 936; ), // greek capital letter psi, U+03A8 
       (entity: 'Omega';  ch: 937; ), // greek capital letter omega, U+03A9 
       (entity: 'alpha';  ch: 945; ), // greek small letter alpha, U+03B1 
       (entity: 'beta';  ch: 946; ), // greek small letter beta, U+03B2 
       (entity: 'gamma';  ch: 947; ), // greek small letter gamma, U+03B3 
       (entity: 'delta';  ch: 948; ), // greek small letter delta, U+03B4 
       (entity: 'epsilon'; ch: 949; ), // greek small letter epsilon, U+03B5 
       (entity: 'zeta';  ch: 950; ), // greek small letter zeta, U+03B6 
       (entity: 'eta';  ch: 951; ), // greek small letter eta, U+03B7 
       (entity: 'theta';  ch: 952; ), // greek small letter theta, U+03B8 
       (entity: 'iota';  ch: 953; ), // greek small letter iota, U+03B9 
       (entity: 'kappa';  ch: 954; ), // greek small letter kappa, U+03BA 
       (entity: 'lambda'; ch: 955; ), // greek small letter lambda, U+03BB 
       (entity: 'mu';   ch: 956; ), // greek small letter mu, U+03BC 
       (entity: 'nu';   ch: 957; ), // greek small letter nu, U+03BD 
       (entity: 'xi';   ch: 958; ), // greek small letter xi, U+03BE 
       (entity: 'omicron'; ch: 959; ), // greek small letter omicron, U+03BF NEW 
       (entity: 'pi';   ch: 960; ), // greek small letter pi, U+03C0 
       (entity: 'rho';  ch: 961; ), // greek small letter rho, U+03C1 
       (entity: 'sigmaf'; ch: 962; ), // greek small letter final sigma, U+03C2 
       (entity: 'sigma';  ch: 963; ), // greek small letter sigma, U+03C3 
       (entity: 'tau';  ch: 964; ), // greek small letter tau, U+03C4 
       (entity: 'upsilon'; ch: 965; ), // greek small letter upsilon, U+03C5 
       (entity: 'phi';  ch: 966; ), // greek small letter phi, U+03C6 
       (entity: 'chi';  ch: 967; ), // greek small letter chi, U+03C7 
       (entity: 'psi';  ch: 968; ), // greek small letter psi, U+03C8 
       (entity: 'omega';  ch: 969; ), // greek small letter omega, U+03C9 
       (entity: 'thetasym'; ch: 977; ), // greek small letter theta symbol, U+03D1 NEW 
       (entity: 'upsih';  ch: 978; ), // greek upsilon with hook symbol, U+03D2 NEW 
       (entity: 'piv';  ch: 982; ), // greek pi symbol, U+03D6 
       (entity: 'bull';  ch: 8226; ), // bullet = black small circle, U+2022 
       (entity: 'hellip'; ch: 8230; ), // horizontal ellipsis = three dot leader, U+2026 
       (entity: 'prime';  ch: 8242; ), // prime = minutes = feet, U+2032 
       (entity: 'Prime';  ch: 8243; ), // double prime = seconds = inches, U+2033 
       (entity: 'oline';  ch: 8254; ), // overline = spacing overscore, U+203E NEW 
       (entity: 'frasl';  ch: 8260; ), // fraction slash, U+2044 NEW 
       (entity: 'ensp';  ch: 8194; ), // en space, U+2002 
       (entity: 'emsp';  ch: 8195; ), // em space, U+2003 
       (entity: 'thinsp'; ch: 8201; ), // thin space, U+2009 
       (entity: 'zwnj';  ch: 8204; ), // zero width non-joiner, U+200C NEW RFC 2070 
       (entity: 'zwj';  ch: 8205; ), // zero width joiner, U+200D NEW RFC 2070 
       (entity: 'lrm';  ch: 8206; ), // left-to-right mark, U+200E NEW RFC 2070 
       (entity: 'rlm';  ch: 8207; ), // right-to-left mark, U+200F NEW RFC 2070 
       (entity: 'ndash';  ch: 8211; ), // en dash, U+2013 
       (entity: 'mdash';  ch: 8212; ), // em dash, U+2014 
       (entity: 'lsquo';  ch: 8216; ), // left single quotation mark, U+2018 
       (entity: 'rsquo';  ch: 8217; ), // right single quotation mark, U+2019 
       (entity: 'sbquo';  ch: 8218; ), // single low-9 quotation mark, U+201A NEW 
       (entity: 'ldquo';  ch: 8220; ), // left double quotation mark, U+201C 
       (entity: 'rdquo';  ch: 8221; ), // right double quotation mark, U+201D 
       (entity: 'bdquo';  ch: 8222; ), // double low-9 quotation mark, U+201E NEW 
       (entity: 'dagger'; ch: 8224; ), // dagger, U+2020 
       (entity: 'Dagger'; ch: 8225; ), // double dagger, U+2021 
       (entity: 'permil'; ch: 8240; ), // per mille sign, U+2030 
       (entity: 'lsaquo'; ch: 8249; ), // single left-pointing angle quotation mark, U+2039 
       (entity: 'rsaquo'; ch: 8250; ), // single right-pointing angle quotation mark, U+203A 
       (entity: 'euro';  ch: 8364; ), // euro sign, U+20AC NEW 
       (entity: 'weierp'; ch: 8472; ), // script capital P = power set = Weierstrass p, U+2118 
       (entity: 'image';  ch: 8465; ), // blackletter capital I = imaginary part, U+2111 
       (entity: 'real';  ch: 8476; ), // blackletter capital R = real part symbol, U+211C 
       (entity: 'trade';  ch: 8482; ), // trade mark sign, U+2122 
       (entity: 'alefsym'; ch: 8501; ), // alef symbol = first transfinite cardinal, U+2135 NEW (alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters) 
       (entity: 'larr';  ch: 8592; ), // leftwards arrow, U+2190 
       (entity: 'uarr';  ch: 8593; ), // upwards arrow, U+2191 
       (entity: 'rarr';  ch: 8594; ), // rightwards arrow, U+2192 
       (entity: 'darr';  ch: 8595; ), // downwards arrow, U+2193 
       (entity: 'harr';  ch: 8596; ), // left right arrow, U+2194 
       (entity: 'crarr';  ch: 8629; ), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW 
       (entity: 'lArr';  ch: 8656; ), // leftwards double arrow, U+21D0 
       (entity: 'uArr';  ch: 8657; ), // upwards double arrow, U+21D1 
       (entity: 'rArr';  ch: 8658; ), // rightwards double arrow, U+21D2 
       (entity: 'dArr';  ch: 8659; ), // downwards double arrow, U+21D3 
       (entity: 'hArr';  ch: 8660; ), // left right double arrow, U+21D4 
       (entity: 'forall'; ch: 8704; ), // for all, U+2200 
       (entity: 'part';  ch: 8706; ), // partial differential, U+2202 
       (entity: 'exist';  ch: 8707; ), // there exists, U+2203 
       (entity: 'empty';  ch: 8709; ), // empty set = null set = diameter, U+2205 
       (entity: 'nabla';  ch: 8711; ), // nabla = backward difference, U+2207 
       (entity: 'isin';  ch: 8712; ), // element of, U+2208 
       (entity: 'notin';  ch: 8713; ), // not an element of, U+2209 
       (entity: 'ni';   ch: 8715; ), // contains as member, U+220B 
       (entity: 'prod';  ch: 8719; ), // n-ary product = product sign, U+220F 
       (entity: 'sum';  ch: 8721; ), // n-ary sumation, U+2211 
       (entity: 'minus';  ch: 8722; ), // minus sign, U+2212 
       (entity: 'lowast'; ch: 8727; ), // asterisk operator, U+2217 
       (entity: 'radic';  ch: 8730; ), // square root = radical sign, U+221A 
       (entity: 'prop';  ch: 8733; ), // proportional to, U+221D 
       (entity: 'infin';  ch: 8734; ), // infinity, U+221E 
       (entity: 'ang';  ch: 8736; ), // angle, U+2220 
       (entity: 'and';  ch: 8743; ), // logical and = wedge, U+2227 
       (entity: 'or';   ch: 8744; ), // logical or = vee, U+2228 
       (entity: 'cap';  ch: 8745; ), // intersection = cap, U+2229 
       (entity: 'cup';  ch: 8746; ), // union = cup, U+222A 
       (entity: 'int';  ch: 8747; ), // integral, U+222B 
       (entity: 'there4'; ch: 8756; ), // therefore, U+2234 
       (entity: 'sim';  ch: 8764; ), // tilde operator = varies with = similar to, U+223C 
       (entity: 'cong';  ch: 8773; ), // approximately equal to, U+2245 
       (entity: 'asymp';  ch: 8776; ), // almost equal to = asymptotic to, U+2248 
       (entity: 'ne';   ch: 8800; ), // not equal to, U+2260 
       (entity: 'equiv';  ch: 8801; ), // identical to, U+2261 
       (entity: 'le';   ch: 8804; ), // less-than or equal to, U+2264 
       (entity: 'ge';   ch: 8805; ), // greater-than or equal to, U+2265 
       (entity: 'sub';  ch: 8834; ), // subset of, U+2282 
       (entity: 'sup';  ch: 8835; ), // superset of, U+2283 
       (entity: 'nsub';  ch: 8836; ), // not a subset of, U+2284 
       (entity: 'sube';  ch: 8838; ), // subset of or equal to, U+2286 
       (entity: 'supe';  ch: 8839; ), // superset of or equal to, U+2287 
       (entity: 'oplus';  ch: 8853; ), // circled plus = direct sum, U+2295 
       (entity: 'otimes'; ch: 8855; ), // circled times = vector product, U+2297 
       (entity: 'perp';  ch: 8869; ), // up tack = orthogonal to = perpendicular, U+22A5 
       (entity: 'sdot';  ch: 8901; ), // dot operator, U+22C5 
       (entity: 'lceil';  ch: 8968; ), // left ceiling = apl upstile, U+2308 
       (entity: 'rceil';  ch: 8969; ), // right ceiling, U+2309 
       (entity: 'lfloor'; ch: 8970; ), // left floor = apl downstile, U+230A 
       (entity: 'rfloor'; ch: 8971; ), // right floor, U+230B 
       (entity: 'lang';  ch: 9001; ), // left-pointing angle bracket = bra, U+2329 
       (entity: 'rang';  ch: 9002; ), // right-pointing angle bracket = ket, U+232A 
       (entity: 'loz';  ch: 9674; ), // lozenge, U+25CA 
       (entity: 'spades'; ch: 9824; ), // black spade suit, U+2660 
       (entity: 'clubs';  ch: 9827; ), // black club suit = shamrock, U+2663 
       (entity: 'hearts'; ch: 9829; ), // black heart suit = valentine, U+2665 
       (entity: 'diams';  ch: 9830; ) // black diamond suit, U+2666 
      ); 


    var 
     scan: Integer; 
     idx: Integer; 
     entityName: string; 
    begin 
     { 
      Tries to read a named entity reference starting at sValue[StartIndex]: 

       EntityRef ::= '&' Name ';' 
       Name      ::= NameStartChar (NameChar)* 

      CharRef receives the matched text ('&' through ';') as soon as the 
      syntax is valid. The function result is the decoded character when the 
      name is found in HtmlEntities, and '' otherwise. 
     } 
     Result := ''; 
     CharRef := ''; 

     if Length(sValue) - StartIndex + 1 < 4 then 
      Exit; 
     if sValue[StartIndex] <> '&' then 
      Exit; 
     if not IsNameStartChar(sValue[StartIndex + 1]) then 
      Exit; 

     // Walk over the remaining name characters; give up if the string ends 
     // before a terminator is reached. 
     scan := StartIndex + 2; 
     while IsNameChar(sValue[scan]) do 
     begin 
      Inc(scan); 
      if scan > Length(sValue) then 
       Exit; 
     end; 
     if sValue[scan] <> ';' then 
      Exit; 

     CharRef := Copy(sValue, StartIndex, scan - StartIndex + 1); 

     // Strip the leading '&' and trailing ';' to get the bare name. 
     entityName := Copy(CharRef, 2, Length(CharRef) - 2); 

     // Case-sensitive lookup: entity names differ by case (larr vs lArr). 
     for idx := Low(HtmlEntities) to High(HtmlEntities) do 
      if HtmlEntities[idx].entity = entityName then 
      begin 
       Result := UCS4CharToString(HtmlEntities[idx].ch); 
       Exit; 
      end; 

     // Syntactically valid reference, but the name is not in the table. 
     if IsDebuggerPresent then 
      OutputDebugString(PChar('HtmlDecode: Unknown HTML entity reference: "'+CharRef+'"')); 
    end; 

var 
    cursor: Integer; 
    matchedText: UnicodeString; 
    decoded: UnicodeString; 
begin 
    // Walk the input once. At every '&' try a numeric character reference 
    // first, then a named entity; if neither decodes, the character is 
    // copied through unchanged. 
    Result := ''; 
    cursor := 1; 

    while cursor <= Length(s) do 
    begin 
     decoded := ''; 
     if s[cursor] = '&' then 
     begin 
      decoded := GetCharRef(s, cursor, {out}matchedText); 
      if decoded = '' then 
       decoded := GetEntityRef(s, cursor, {out}matchedText); 
     end; 

     if decoded <> '' then 
     begin 
      // Emit the decoded text and skip the whole reference. 
      Result := Result + decoded; 
      Inc(cursor, Length(matchedText)); 
     end 
     else 
     begin 
      Result := Result + s[cursor]; 
      Inc(cursor); 
     end; 
    end; 
end;

来源

2017-12-18 19:29:31

德尔福HTML解码

回答

相关问题