使用正则表达式和 HTML 清理器安全问题

发布于 2024-10-08 22:34:09 字数 4397 浏览 8 评论 0原文

我知道用正则表达式解析 HTML 不好，而且它不能适用于所有情况（Stack Overflow 上有很多关于这方面的主题）。但我仍然想尝试使用基于白名单方法的正则表达式来清理 HTML。

我想向您展示我的代码（用 PHP 5.2 编写）。看起来工作正常，但我仍然想知道是否存在安全问题。

那么，我是不是搞错了什么？

基本原理是使用 Html_Sanitizer::sanitize()

函数首先用 token 替换允许的没有属性的标签。然后解析具有属性的标签并将其替换为标记。
然后解析 HTML 标签以检测允许的属性（使用 cleanTag 函数）。因此，HTML 标签以（希望）安全的方式重新构建。
htmlspecialchars 用于确保剩余代码是干净的
标记被替换为安全标记。

代码：

class Html_Sanitizer
{
    const VALIDATOR_CSS_UNIT = '(([\+\-]?[0-9\.]+)(em|ex|px|in|cm|mm|pt|pc|\%))|0';
    const VALIDATOR_URL = 'http://\\S+';
    const VALIDATOR_CSS_PROPERTY = '[a-z\-]+';
    const VALIDATOR_STYLE = '[^"]*';

    protected static $_tags = 'a|b|blockquote|br|cite|d[ldt]|h[1-6]|i|img|li|ol|p|span|strong|u|ul';

    protected static $_attributes = array(
        'img' => array(
            'width' => '[0-9]+',
            'height' => '[0-9]+',
            'src' => self::VALIDATOR_URL,
            'style' => self::VALIDATOR_STYLE
            ),
        'span' => array(
            'style' => self::VALIDATOR_STYLE
            ),
        'p' => array(
            'style' => self::VALIDATOR_STYLE
            ),
        'a' =>  array(
            'href' => self::VALIDATOR_URL
            )
    );

    protected static $_styleValidators = array(
        'color' => '(\#[a-fA-F0-9]+)|([a-z ]+)',
        'background-color' => '\#[a-zA-Z0-9]+',
        'font-style' => '(normal|italic|oblique)',
        'font-size' => '[\-a-z]+',
        'margin-left' => self::VALIDATOR_CSS_UNIT,
        'margin-right' => self::VALIDATOR_CSS_UNIT,
        'text-align' => '(left|right|center|justify)',
        'text-indent' => self::VALIDATOR_CSS_UNIT,
        'text-decoration' => '(none|overline|underline|blink|line-through)',
        'width' => self::VALIDATOR_CSS_UNIT,
        'height' => self::VALIDATOR_CSS_UNIT
    );

    public static function sanitize($str)
    {
        $tokens = array();

        //tokenize opening tags with no attributes
        $pattern = '#<(/)?('. self::$_tags .')>#';
        $replace = '__SAFE_TAG_$1$2__';
        $str = preg_replace($pattern, $replace, $str);

        // tokenize tags with attributes
        $pattern = '#<('. self::$_tags .')(?:\s+(?:[a-z]+)="(?:[^"\\\]*(?:\\\"[^"\\\]*)*)")*\s*(/)?>#';
        preg_match_all($pattern, $str, $matches, PREG_SET_ORDER);
        foreach($matches as $i => $match) {
            $tokens[$i] = self::cleanTag($match[1], $match[0]);
            $str = str_replace($match[0], '__SAFE_TOKEN_'.$i.'__', $str);
        }

        $str = htmlspecialchars($str);

        foreach ($tokens as $i => $cleanTag) {
            $str = str_replace('__SAFE_TOKEN_'.$i.'__', $cleanTag, $str);
        }

        $pattern = '#__SAFE_TAG_(/?(?:'. self::$_tags .'))__#';
        $replace = '<$1>';
        $str = preg_replace($pattern, $replace, $str);

        return $str;
    }

    public static function cleanTag($tag, $str)
    {
        $cleanTag = '<' . $tag;

        if ($tag === 'a') {
            $cleanTag .= ' rel="nofolow" target="_blank"';
        }

        if (isset(self::$_attributes[$tag])) {
            foreach(self::$_attributes[$tag] as $attr => $attrPattern) {
                $pattern = '#'.$attr.'="('. $attrPattern .')"#';
                preg_match($pattern, $str, $match);
                if (isset($match[1])) {
                    if ($attr == 'style') {
                        $cleanTag .= ' style="' . self::cleanStyle($match[1]) . '"';
                    } else {
                        $cleanTag .= ' ' . $attr . '="' . $match[1] . '"';
                    }
                }
            }
        }

        if ($tag === 'img') {
            $cleanTag .= ' /';
        }

        $cleanTag .= '>';
        return $cleanTag;
    }

    public static function cleanStyle($style)
    {
        $cleanStyle = '';

        foreach(self::$_styleValidators as $stl => $stlPattern) {
            $pattern = '#[; ]?' . $stl . '\s*:\s*(' . $stlPattern . ')\s*;#i';
            preg_match($pattern, $style, $match);
            if (isset($match[1])) {
                $cleanStyle .= ($cleanStyle ? ' ' : '') . $stl . ':' . $match[1] . ';';
            }
        }

        return $cleanStyle;
    }
}

原文

I know that parsing HTML with regexp is bad, and it can not work for all cases (there are plenty topics about that on Stack Overflow).
But I still wanted to try to sanitize HTML with regex based on a whitelist method.

I would like to show you my code bellow (written in PHP 5.2).
It seems to work fine, but I'm still wondering if there are security issues.

So, did I got something wrong ?

Basic principle is to use Html_Sanitizer::sanitize()

The function first replaces allowed tags with no attributes with tokens. Then parse for tags with attributes and replace them with token too.
The HTML tags are then parsed to detect the allowed attributes (using the cleanTag function). The HTML tag is therefore re-builded in a (lets-hope) safe way.
htmlspecialchars is used to be sure that remaining code is clean
tokens are replaced with safe tags.

Code:

class Html_Sanitizer
{
    const VALIDATOR_CSS_UNIT = '(([\+\-]?[0-9\.]+)(em|ex|px|in|cm|mm|pt|pc|\%))|0';
    const VALIDATOR_URL = 'http://\\S+';
    const VALIDATOR_CSS_PROPERTY = '[a-z\-]+';
    const VALIDATOR_STYLE = '[^"]*';

    protected static $_tags = 'a|b|blockquote|br|cite|d[ldt]|h[1-6]|i|img|li|ol|p|span|strong|u|ul';

    protected static $_attributes = array(
        'img' => array(
            'width' => '[0-9]+',
            'height' => '[0-9]+',
            'src' => self::VALIDATOR_URL,
            'style' => self::VALIDATOR_STYLE
            ),
        'span' => array(
            'style' => self::VALIDATOR_STYLE
            ),
        'p' => array(
            'style' => self::VALIDATOR_STYLE
            ),
        'a' =>  array(
            'href' => self::VALIDATOR_URL
            )
    );

    protected static $_styleValidators = array(
        'color' => '(\#[a-fA-F0-9]+)|([a-z ]+)',
        'background-color' => '\#[a-zA-Z0-9]+',
        'font-style' => '(normal|italic|oblique)',
        'font-size' => '[\-a-z]+',
        'margin-left' => self::VALIDATOR_CSS_UNIT,
        'margin-right' => self::VALIDATOR_CSS_UNIT,
        'text-align' => '(left|right|center|justify)',
        'text-indent' => self::VALIDATOR_CSS_UNIT,
        'text-decoration' => '(none|overline|underline|blink|line-through)',
        'width' => self::VALIDATOR_CSS_UNIT,
        'height' => self::VALIDATOR_CSS_UNIT
    );

    public static function sanitize($str)
    {
        $tokens = array();

        //tokenize opening tags with no attributes
        $pattern = '#<(/)?('. self::$_tags .')>#';
        $replace = '__SAFE_TAG_$1$2__';
        $str = preg_replace($pattern, $replace, $str);

        // tokenize tags with attributes
        $pattern = '#<('. self::$_tags .')(?:\s+(?:[a-z]+)="(?:[^"\\\]*(?:\\\"[^"\\\]*)*)")*\s*(/)?>#';
        preg_match_all($pattern, $str, $matches, PREG_SET_ORDER);
        foreach($matches as $i => $match) {
            $tokens[$i] = self::cleanTag($match[1], $match[0]);
            $str = str_replace($match[0], '__SAFE_TOKEN_'.$i.'__', $str);
        }

        $str = htmlspecialchars($str);

        foreach ($tokens as $i => $cleanTag) {
            $str = str_replace('__SAFE_TOKEN_'.$i.'__', $cleanTag, $str);
        }

        $pattern = '#__SAFE_TAG_(/?(?:'. self::$_tags .'))__#';
        $replace = '<$1>';
        $str = preg_replace($pattern, $replace, $str);

        return $str;
    }

    public static function cleanTag($tag, $str)
    {
        $cleanTag = '<' . $tag;

        if ($tag === 'a') {
            $cleanTag .= ' rel="nofolow" target="_blank"';
        }

        if (isset(self::$_attributes[$tag])) {
            foreach(self::$_attributes[$tag] as $attr => $attrPattern) {
                $pattern = '#'.$attr.'="('. $attrPattern .')"#';
                preg_match($pattern, $str, $match);
                if (isset($match[1])) {
                    if ($attr == 'style') {
                        $cleanTag .= ' style="' . self::cleanStyle($match[1]) . '"';
                    } else {
                        $cleanTag .= ' ' . $attr . '="' . $match[1] . '"';
                    }
                }
            }
        }

        if ($tag === 'img') {
            $cleanTag .= ' /';
        }

        $cleanTag .= '>';
        return $cleanTag;
    }

    public static function cleanStyle($style)
    {
        $cleanStyle = '';

        foreach(self::$_styleValidators as $stl => $stlPattern) {
            $pattern = '#[; ]?' . $stl . '\s*:\s*(' . $stlPattern . ')\s*;#i';
            preg_match($pattern, $style, $match);
            if (isset($match[1])) {
                $cleanStyle .= ($cleanStyle ? ' ' : '') . $stl . ':' . $match[1] . ';';
            }
        }

        return $cleanStyle;
    }
}

分享到QQ

分享到微博