2012-04-19 16 views
0

如何在 SQL Server 中用函数删除字符串中的所有 HTML 标签(em/strong 除外)?

我有这样一个函数:

CREATE FUNCTION [dbo].[udf_StripHTML]
(@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    -- Removes every '<' ... '>' span from @HTMLText and returns the remainder
    -- with leading/trailing spaces trimmed. A '<' that has no '>' after it
    -- stops the scan and is left in place. NULL input yields NULL.
    DECLARE @TagOpen INT;
    DECLARE @TagClose INT;

    WHILE 1 = 1
    BEGIN
        -- Always restart from the beginning: the previous tag has been removed.
        SET @TagOpen = CHARINDEX('<', @HTMLText);

        -- ISNULL guard: CHARINDEX returns NULL for NULL input, and a bare
        -- "= 0" test would never become true in that case.
        IF ISNULL(@TagOpen, 0) = 0
            BREAK;

        SET @TagClose = CHARINDEX('>', @HTMLText, @TagOpen);

        IF @TagClose = 0
            BREAK;

        -- Delete the tag including both angle brackets.
        SET @HTMLText = STUFF(@HTMLText, @TagOpen, (@TagClose - @TagOpen) + 1, '');
    END

    RETURN LTRIM(RTRIM(@HTMLText));
END

我需要去掉除 <em> 和 <strong> 之外的所有标签。

谢谢

托马斯通过CLR函数

回答

1

XSLT转换(例如,从MDS组件之一)

更新

下面是一个使用XSLT转换方法的答案: How to remove all tags except for some using Nokogiri

更新2

那么剩下的唯一选择就是使用正则表达式,同样需要通过 CLR 函数来实现。

Strip all HTML tags except links

安装 MDS 程序集:

Deploy SQL 2008 R2 MDS Functions without MDS

+0

这对格式不规范的 HTML 也有效吗?我需要删除的标签里,属性值没有加引号(例如 adsf),也没有 HREF – tsdexter 2012-04-19 20:08:02

+0

SQL Server 并不适合做这类事情,不过我确信你可以先对输入的 HTML 字符串应用 **HTML Tidy**,然后再提交处理 – 2012-04-19 20:12:29

+0

因为这项工作是用新数据替换旧的错误数据的一部分,而旧数据非常特殊(即:要删除的 HTML 标签全部是大写,而我想保留的标签是小写),所以我用 PATINDEX 代替 CHARINDEX 就能更轻松地做到这一点 - 请参阅我的答案。 – tsdexter 2012-04-19 20:26:02

0

编辑:下面是我针对自己的问题所采用的做法,因为我的 HTML 非常特殊。对于我最初提出的问题,正确答案是上面我已采纳的 Dennis 的回答。

我使用的数据非常具体。

我需要删除的标签都是大写,即:

而我想保留的标签都是小写,即:<em>、<strong>

所以我用 PATINDEX 代替 CHARINDEX,就很容易地做到了这一点:

ALTER FUNCTION [dbo].[udf_StripHTMLlinks]
(@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    -- Removes only tags that start with '<' followed by an upper-case letter
    -- or '/', whose next character is an upper-case letter, '/', a space or
    -- '>'. Lower-case tags such as <em> or </strong> do not match and are
    -- kept. The result is returned with leading/trailing spaces trimmed;
    -- NULL input yields NULL.
    DECLARE @UpperTagPattern VARCHAR(100);
    DECLARE @TagOpen INT;
    DECLARE @TagClose INT;

    -- The letters are listed one by one and matched under a case-sensitive
    -- collation. NOTE(review): presumably because a range such as [A-Z]
    -- follows collation sort order and could also match lower-case letters -
    -- confirm before shortening the pattern.
    SET @UpperTagPattern = '%<[/ABCDEFGHIJKLMNOPQRSTUVWXYZ][/ABCDEFGHIJKLMNOPQRSTUVWXYZ >]%';

    WHILE 1 = 1
    BEGIN
        SET @TagOpen = PATINDEX(@UpperTagPattern, @HTMLText COLLATE SQL_Latin1_General_CP1_CS_AS);

        -- ISNULL guard: PATINDEX returns NULL for NULL input.
        IF ISNULL(@TagOpen, 0) = 0
            BREAK;

        SET @TagClose = CHARINDEX('>', @HTMLText, @TagOpen);

        -- A matching '<' without a closing '>' ends the scan.
        IF @TagClose = 0
            BREAK;

        -- Delete the tag including both angle brackets.
        SET @HTMLText = STUFF(@HTMLText, @TagOpen, (@TagClose - @TagOpen) + 1, '');
    END

    RETURN LTRIM(RTRIM(@HTMLText));
END

感谢您的意见。

0

您可以使用这样的 SQL 函数:

ALTER FUNCTION [dbo].[StripOutHTML]
(
    @HTMLText VARCHAR(max),
    @stripDisallowedOnly BIT
)
returns VARCHAR(max)
AS
    BEGIN
        -- Converts an HTML fragment to text.
        --
        -- Steps, in order:
        --   1. Decode the entities &amp; &lt; &gt; &nbsp;.
        --   2. Turn <P>, </P>, <br>, <br/> and <br /> into CR/LF.
        --   3. Remove the remaining '<' ... '>' tags.
        --   4. Trim leading tab/CR/LF/space and surrounding spaces.
        --
        -- @HTMLText            : text to clean; NULL yields NULL.
        -- @stripDisallowedOnly : 1    -> keep <b>, <u>, <i> tags (opening and
        --                                closing, attributes are kept as-is)
        --                                and remove every other tag;
        --                        else -> remove every tag (0 or NULL).
        --
        -- To keep more tags (for example EM or STRONG) add further LIKE
        -- patterns to the allow-list test in the tag loop below.
        DECLARE @Start INT
        DECLARE @End INT
        DECLARE @Length INT
        DECLARE @Tag VARCHAR(max)
        DECLARE @NewLine VARCHAR(2)
        DECLARE @NameEndPattern VARCHAR(16)
        DECLARE @FirstKept INT
        DECLARE @trimchars VARCHAR(10)

        SET @NewLine = CHAR(13) + CHAR(10)

        -- Character class for whatever may follow a tag name: space, '/', '>',
        -- tab, LF or CR. Requiring one of these makes the allow-list match the
        -- whole tag name, so <body>, <img>, <ul> ... are not mistaken for
        -- <b>, <i>, <u>.
        SET @NameEndPattern = '[ />' + CHAR(9) + CHAR(10) + CHAR(13) + ']%'

        -- Decode '&amp;' first and repeat until none is left, because the text
        -- may be double encoded ('&amp;amp;' -> '&amp;' -> '&'). After this loop
        -- no '&amp;' remains, so no separate pass for '&amp;amp;' is needed.
        WHILE CHARINDEX('&amp;', @HTMLText) > 0
        BEGIN
            SET @HTMLText = REPLACE(@HTMLText, '&amp;', '&')
        END

        -- The remaining replacements cannot create a new occurrence of the
        -- text they replace, so a single REPLACE pass is enough for each.
        SET @HTMLText = REPLACE(@HTMLText, '&lt;', '<')
        SET @HTMLText = REPLACE(@HTMLText, '&gt;', '>')
        SET @HTMLText = REPLACE(@HTMLText, '&nbsp;', ' ')

        -- Paragraph tags become <br> so that they turn into new lines below.
        SET @HTMLText = REPLACE(@HTMLText, '<P>', '<br>')
        SET @HTMLText = REPLACE(@HTMLText, '</P>', '<br>')

        -- Every spelling of the line break becomes a real CR/LF pair.
        SET @HTMLText = REPLACE(@HTMLText, '<br>', @NewLine)
        SET @HTMLText = REPLACE(@HTMLText, '<br/>', @NewLine)
        SET @HTMLText = REPLACE(@HTMLText, '<br />', @NewLine)

        -- Remove the remaining tags. A tag is the span from a '<' to the
        -- first '>' after it; a '<' with no '>' after it ends the scan.
        SET @Start = CHARINDEX('<', @HTMLText)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
        SET @Length = (@End - @Start) + 1

        WHILE (@Start > 0
           AND @End > 0
           AND @Length > 0)
        BEGIN
            SET @Tag = UPPER(SUBSTRING(@HTMLText, @Start, @Length))

            IF @stripDisallowedOnly = 1
               AND (   @Tag LIKE '<[BUI]' + @NameEndPattern
                    OR @Tag LIKE '</[BUI]' + @NameEndPattern)
            BEGIN
                -- Allowed tag: leave it and continue after its closing '>'.
                SET @Start = CHARINDEX('<', @HTMLText, @End)
            END
            ELSE
            BEGIN
                -- Disallowed tag: delete it; the text that followed now
                -- begins at @Start, so continue scanning from there.
                SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
                SET @Start = CHARINDEX('<', @HTMLText, @Start)
            END

            SET @End = CHARINDEX('>', @HTMLText, @Start)
            SET @Length = (@End - @Start) + 1
        END

        -- Remove leading tab / LF / CR / space characters.
        SET @trimchars = CHAR(9) + CHAR(10) + CHAR(13) + CHAR(32)

        IF @HTMLText LIKE '[' + @trimchars + ']%'
        BEGIN
            SET @FirstKept = PATINDEX('%[^' + @trimchars + ']%', @HTMLText)

            -- No other character at all: the whole text is white space.
            IF @FirstKept = 0
                SET @HTMLText = ''
            ELSE
                SET @HTMLText = SUBSTRING(@HTMLText, @FirstKept, LEN(@HTMLText))
        END

        RETURN LTRIM(RTRIM(@HTMLText))
    END

您需要修改下面循环中判断允许保留标签的条件(原始写法形如 (Upper(Substring(@HTMLText, @Start, 2)) <> '<B')),才能保留 em 之类的标签。该 SQL 函数还会把 <BR>、<P> 标签替换为换行符;如果不需要,可以很容易地删掉这些行。希望这能帮到你,或者为你指出正确的方向。

-- Tag-stripping loop (excerpt). Expects @HTMLText, @stripDisallowedOnly,
-- @Start, @End and @Length to be declared and initialised by the enclosing
-- code: @Start/@End are the positions of the current '<' and the first '>'
-- after it, and @Length is the length of that tag.
--
-- With @stripDisallowedOnly = 1 the tags <b>, <u>, <i> (opening and closing,
-- with or without attributes) are kept and every other tag is removed;
-- otherwise every tag is removed. To keep more tags (for example EM or
-- STRONG) add further LIKE patterns to the condition below.
WHILE (@Start > 0
     AND @End > 0
     AND @Length > 0)
BEGIN
    -- The character after the tag name must be a space, '/', '>', tab, LF or
    -- CR, so the whole name is compared: <body>, <img>, <ul> ... are not
    -- mistaken for <b>, <i>, <u>.
    IF @stripDisallowedOnly = 1
       AND (   Upper(Substring(@HTMLText, @Start, @Length))
                   LIKE '<[BUI][ />' + CHAR(9) + CHAR(10) + CHAR(13) + ']%'
            OR Upper(Substring(@HTMLText, @Start, @Length))
                   LIKE '</[BUI][ />' + CHAR(9) + CHAR(10) + CHAR(13) + ']%')
     BEGIN
      -- Allowed tag: keep it. Zero length makes the next search start at
      -- the tag's closing '>'.
      SET @Length = 0
     END
    ELSE
     BEGIN
      SET @HTMLText = Stuff(@HTMLText, @Start, @Length, '')
     END

    -- After a removal this resumes just before the deleted tag; after a
    -- kept tag it resumes at that tag's '>'.
    SET @Start = Charindex('<', @HTMLText, @End - @Length)
    SET @End = Charindex('>', @HTMLText, Charindex('<', @HTMLText,
             @Start)
       )
    SET @Length = (@End - @Start) + 1
END
相关问题