《php清除XSS通用类:CI框架的XSS移除类》要点:
本文介绍了php清除XSS通用类:CI框架的XSS移除类,希望对您有用。如果有疑问,可以联系我们。
前端在线编辑器供用户录入文章内容或论坛帖子,因此也会成为攻击者利用的一个渠道,其中主要有XSS攻击和图片上传攻击等。
本文着重讲解XSS攻击和过滤,让网站更安全。
下面是一个CI框架的XSS过滤类。处理这类攻击很好用。
调用:
// Demo input: a plain link, a link and an image that carry inline event
// handlers, a <script> element, a text input and two bold elements.
$str = ' 这是一个正常链接 <a href="http://asdfsd.com">链接1</a>,'
    . ' 这是一个有罪恶属性的链接 <a href="http://www.asdfsd.com" onmousemove="alert(\'a\')">链接1</a>,<br />'
    . ' 这是一个图片<img src="http://www.baidu.com/img/baidu_sylogo1.gif" border="0" onclick="alert(\'hi\')" />,'
    . ' 这是一个脚本<script>alert("hi")</script>'
    . ' 这是一个文本框<input type="text" size="14" onmousemove="alert(\'a\')" border="1" /> <br />'
    . ' 这是一个加粗字<strong>STRONG</strong>和<b>B</b> ';

// Run the markup through the filter in normal (non-image) mode and print it.
$xssFilter = new cleanXSS();
$str = $xssFilter->xss_clean($str, false);
echo $str;

/*
 * Result as printed in the original article.
 * NOTE(review): this is the article's browser-rendered rendition, so entity
 * escaped output may appear here as raw tags - confirm against a real run.
 *
 *   这是一个正常链接 <a href="http://asdfsd.com">链接1</a>,
 *   这是一个有罪恶属性的链接 <a >链接1</a>,<br />
 *   这是一个图片<img />,
 *   这是一个脚本[removed]alert("hi")[removed]
 *   这是一个文本框<input type="text" size="14" border="1" /><br />
 *   这是一个加粗字<strong>STRONG</strong>和<b>B</b>
 *
 * In short (per the article):
 *   - script tags are replaced by the [removed] marker;
 *   - links and images with illegal attributes keep only the bare tag.
 *
 * Caveat from the article: for the pair
 *   <input type="text" size="14" onmousemove="alert('a')" border="1" /> <br />
 * a <br /> placed immediately after the <input ... /> (no space in between)
 * also gets converted - presumably to its entity-escaped form; TODO confirm.
 */
类文件:
// Suitable for sanitising rich-text (online editor) input and URLs.

/**
 * Stand-alone XSS filter derived from the CodeIgniter 2.x Security class.
 *
 * The main entry point is xss_clean(); it relies on the global helper
 * remove_invisible_characters() declared after the class.
 */
class cleanXSS
{
    /**
     * Cached random token used as a temporary stand-in for "&" while
     * entities are being validated.
     *
     * @var string
     */
    protected $_xss_hash = '';

    /**
     * Literal substrings that are always replaced (needle => replacement).
     *
     * The comment/CDATA openers are neutralised by entity-escaping their
     * angle brackets rather than by removal.
     *
     * @var array
     */
    protected $_never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<![CDATA['       => '&lt;![CDATA[',
        '<comment>'       => '&lt;comment&gt;',
    );

    /**
     * Regex fragments (without delimiters) whose matches are always
     * replaced by "[removed]".
     *
     * @var array
     */
    protected $_never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:',             // IE
        'Redirect\s+302',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?",
    );

    public function __construct()
    {
    }

    // --------------------------------------------------------------------

    /**
     * Sanitise a string (or every element of an array) against XSS.
     *
     * @param string|array $str      Data to clean; arrays are cleaned
     *                               element by element (recursively, always
     *                               in non-image mode).
     * @param bool         $is_image When TRUE the input is treated as image
     *                               data and only a verdict is returned.
     * @return string|array|bool Cleaned string/array, or for images TRUE if
     *                           nothing had to be altered and FALSE otherwise.
     */
    public function xss_clean($str, $is_image = false)
    {
        // Arrays: clean each element. foreach replaces the each() loop,
        // which no longer exists in PHP 8.
        if (is_array($str)) {
            foreach (array_keys($str) as $key) {
                $str[$key] = $this->xss_clean($str[$key]);
            }

            return $str;
        }

        // Remove invisible characters.
        $str = remove_invisible_characters($str);

        // Validate entities in URLs.
        $str = $this->_validate_entities($str);

        // URL-decode, in case something like
        //   <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
        // is submitted. rawurldecode() is used so plus signs are kept.
        $str = rawurldecode($str);

        // Convert character entities to ASCII so the tests below work
        // reliably. Only entities inside tags are converted.
        $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array($this, '_convert_attribute'), $str);
        $str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", array($this, '_decode_entity'), $str);

        // Remove invisible characters again (decoding may have produced some).
        $str = remove_invisible_characters($str);

        // Convert tabs to spaces so that "ja	vascript" cannot slip through;
        // spaces between characters are dealt with further down.
        if (strpos($str, "\t") !== false) {
            $str = str_replace("\t", ' ', $str);
        }

        // Snapshot used by the image verdict at the end.
        $converted_string = $str;

        // Remove strings that are never allowed.
        $str = $this->_do_never_allowed($str);

        // Make PHP tags safe by entity-escaping the angle brackets.
        // Note: "<?xml" is affected as well.
        if ($is_image === true) {
            // Image data may contain the short open/close tags by accident,
            // so only the long opening tag is handled.
            $str = preg_replace('/<\?(php)/i', '&lt;?\\1', $str);
        } else {
            $str = str_replace(array('<?', '?' . '>'), array('&lt;?', '?&gt;'), $str);
        }

        // Compact exploded words such as "j a v a s c r i p t".
        $words = array(
            'javascript', 'expression', 'vbscript', 'script', 'base64',
            'applet', 'alert', 'document', 'write', 'cookie', 'window',
        );

        foreach ($words as $word) {
            // Builds e.g. "w\s*i\s*n\s*d\s*o\s*w": optional whitespace
            // between the letters, none after the last one.
            $spaced = implode('\s*', str_split($word));

            // Only compact when followed by a non-word character, so valid
            // text like "dealer to" does not become "dealerto".
            $str = preg_replace_callback(
                '#(' . $spaced . ')(\W)#is',
                array($this, '_compact_exploded_words'),
                $str
            );
        }

        // Remove disallowed JavaScript in links / img tags and strip
        // script/xss tags; repeat until the string stops changing.
        do {
            $original = $str;

            if (preg_match('/<a/i', $str)) {
                $str = preg_replace_callback('#<a\s+([^>]*?)(>|$)#si', array($this, '_js_link_removal'), $str);
            }

            if (preg_match('/<img/i', $str)) {
                $str = preg_replace_callback('#<img\s+([^>]*?)(\s?/?>|$)#si', array($this, '_js_img_removal'), $str);
            }

            if (preg_match('/script/i', $str) || preg_match('/xss/i', $str)) {
                $str = preg_replace('#<(/*)(script|xss)(.*?)\>#si', '[removed]', $str);
            }
        } while ($original !== $str);

        unset($original);

        // Remove evil attributes such as style, onclick and xmlns.
        $str = $this->_remove_evil_attributes($str, $is_image);

        // Sanitise naughty HTML elements: a tag whose name is in the list
        // gets its brackets converted to entities, e.g. <blink> becomes
        // &lt;blink&gt;.
        $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
        $str = preg_replace_callback(
            '#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is',
            array($this, '_sanitize_naughty_html'),
            $str
        );

        // Sanitise naughty scripting calls: the parentheses are converted
        // to entities, e.g. eval('some code') becomes
        // eval&#40;'some code'&#41;.
        $str = preg_replace(
            '#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
            '\\1\\2&#40;\\3&#41;',
            $str
        );

        // Final clean-up in case something got through the filters above.
        $str = $this->_do_never_allowed($str);

        // Images: report whether anything was changed after the character
        // conversion step. Unchanged means the image is considered clean.
        if ($is_image === true) {
            return $str === $converted_string;
        }

        // Drop everything enclosed between two "[removed]" markers,
        // including the markers themselves (ungreedy, dot matches newline).
        $str = preg_replace('/(\[removed\]).*\\1/iUs', '', $str);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * Random token used to protect "&" in URLs during entity validation.
     *
     * Generated once per instance and cached in $_xss_hash.
     *
     * @return string 32 hexadecimal characters
     */
    public function xss_hash()
    {
        if ($this->_xss_hash == '') {
            // Prefer the CSPRNG where it exists (PHP 7+); otherwise fall
            // back to an md5-based token.
            $this->_xss_hash = function_exists('random_bytes')
                ? bin2hex(random_bytes(16))
                : md5(uniqid((string) mt_rand(), true));
        }

        return $this->_xss_hash;
    }

    // --------------------------------------------------------------------

    /**
     * Decode HTML entities, including numeric ones lacking the semicolon.
     *
     * html_entity_decode() does not convert entities without a trailing
     * semicolon, so those numeric forms are handled here explicitly.
     *
     * @param string $str
     * @param string $charset
     * @return string
     */
    public function entity_decode($str, $charset = 'UTF-8')
    {
        if (stristr($str, '&') === false) {
            return $str;
        }

        $str = html_entity_decode($str, ENT_COMPAT, $charset);

        // Callbacks replace the "e" pattern modifier, which was removed in
        // PHP 7. Code points are reduced modulo 256 before chr(), matching
        // chr()'s historical wrap-around.
        $str = preg_replace_callback(
            '~&#x(0*[0-9a-f]{2,5})~i',
            function ($m) {
                return chr(hexdec($m[1]) % 256);
            },
            $str
        );

        return preg_replace_callback(
            '~&#([0-9]{2,4})~',
            function ($m) {
                return chr(((int) $m[1]) % 256);
            },
            $str
        );
    }

    // --------------------------------------------------------------------

    /**
     * Strip characters and sequences that are unsafe in a file name.
     *
     * @param string $str
     * @param bool   $relative_path When FALSE, "./" and "/" are removed too.
     * @return string
     */
    public function sanitize_filename($str, $relative_path = false)
    {
        $bad = array(
            '../', '<!--', '-->', '<', '>', "'", '"', '&', '$', '#',
            '{', '}', '[', ']', '=', ';', '?',
            '%20',
            '%22',
            '%3c',   // <
            '%253c', // < (double encoded)
            '%3e',   // >
            '%0e',
            '%28',   // (
            '%29',   // )
            '%2528', // ( (double encoded)
            '%26',   // &
            '%24',   // $
            '%3f',   // ?
            '%3b',   // ;
            '%3d',   // =
        );

        if (!$relative_path) {
            $bad[] = './';
            $bad[] = '/';
        }

        $str = remove_invisible_characters($str, false);

        return stripslashes(str_replace($bad, '', $str));
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): remove whitespace from an exploded word
     * such as "j a v a s c r i p t".
     *
     * @param array $matches [1] the exploded word, [2] the following
     *                       non-word character
     * @return string
     */
    protected function _compact_exploded_words($matches)
    {
        return preg_replace('/\s+/s', '', $matches[1]) . $matches[2];
    }

    // --------------------------------------------------------------------

    /**
     * Remove evil HTML attributes (event handlers, style, xmlns, formaction).
     *
     * The attribute is removed together with either everything up to the
     * next whitespace (unquoted value) or everything inside the quotes.
     *
     * @param string $str      The string to check
     * @param bool   $is_image TRUE if this is an image
     * @return string The string with the evil attributes removed
     */
    protected function _remove_evil_attributes($str, $is_image)
    {
        // All event handlers (onload, onclick, ...), style, xmlns, formaction.
        $evil_attributes = array('on\w*', 'style', 'xmlns', 'formaction');

        if ($is_image === true) {
            // Adobe Photoshop puts XML metadata (with namespacing) into
            // JFIF images, so xmlns has to be allowed for images.
            unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
        }

        do {
            $count = 0;
            $attribs = array();

            // Illegal attribute strings without quotes.
            preg_match_all('/(' . implode('|', $evil_attributes) . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);

            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // Illegal attribute strings with quotes (042 and 047 are the
            // octal codes of the double and single quote).
            preg_match_all("/(" . implode('|', $evil_attributes) . ")\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is", $str, $matches, PREG_SET_ORDER);

            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // Replace illegal attribute strings that are inside an HTML tag.
            if (count($attribs) > 0) {
                $str = preg_replace("/<(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(" . implode('|', $attribs) . ")(.*?)([\s><])([><]*)/i", '<$1 $3$5$6$7', $str, -1, $count);
            }
        } while ($count);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): entity-escape a naughty HTML element.
     *
     * @param array $matches
     * @return string
     */
    protected function _sanitize_naughty_html($matches)
    {
        // Encode the opening bracket.
        $str = '&lt;' . $matches[1] . $matches[2] . $matches[3];

        // Encode captured opening/closing brackets to prevent recursive
        // vectors.
        $str .= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): sanitise the attributes of a link tag.
     *
     * The attribute section is reduced to its quoted attributes and any
     * href leading up to a scripting keyword is removed.
     *
     * @param array $match
     * @return string
     */
    protected function _js_link_removal($match)
    {
        $attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

        return str_replace(
            $match[1],
            preg_replace(
                '#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si',
                '',
                $attributes
            ),
            $match[0]
        );
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): sanitise the attributes of an image tag.
     *
     * Same approach as _js_link_removal(), applied to src.
     *
     * @param array $match
     * @return string
     */
    protected function _js_img_removal($match)
    {
        $attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));

        return str_replace(
            $match[1],
            preg_replace(
                '#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si',
                '',
                $attributes
            ),
            $match[0]
        );
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): entity-escape angle brackets and double
     * the backslashes inside a quoted attribute.
     *
     * @param array $match
     * @return string
     */
    protected function _convert_attribute($match)
    {
        return str_replace(array('>', '<', '\\'), array('&gt;', '&lt;', '\\\\'), $match[0]);
    }

    // --------------------------------------------------------------------

    /**
     * Keep only quoted name="value" attributes and strip C-style comments
     * from them.
     *
     * @param string $str
     * @return string
     */
    protected function _filter_attributes($str)
    {
        $out = '';

        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches)) {
            foreach ($matches[0] as $match) {
                $out .= preg_replace('#/\*.*?\*/#s', '', $match);
            }
        }

        return $out;
    }

    // --------------------------------------------------------------------

    /**
     * Callback for xss_clean(): decode the entities of a matched tag.
     *
     * @param array $match
     * @return string
     */
    protected function _decode_entity($match)
    {
        return $this->entity_decode($match[0], 'utf-8');
    }

    // --------------------------------------------------------------------

    /**
     * Validate URL entities; called by xss_clean().
     *
     * @param string $str
     * @return string
     */
    protected function _validate_entities($str)
    {
        // Protect GET variables in URLs: swap the "&" of "&name=value" for
        // the xss_hash() token now and swap it back at the end.
        $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $this->xss_hash() . "\\1=\\2", $str);

        // Validate standard character entities: add a semicolon if it is
        // missing so the entity can be converted to ASCII later.
        $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

        // Validate UTF16 two-byte encoding (x00): likewise add a semicolon
        // if missing.
        $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

        // Un-protect GET variables in URLs.
        $str = str_replace($this->xss_hash(), '&', $str);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * Apply the never-allowed literal and regex replacements.
     *
     * @param string $str
     * @return string
     */
    protected function _do_never_allowed($str)
    {
        // Replace document.cookie and friends by their configured
        // replacement ("[removed]" or an entity-escaped form).
        $str = str_replace(
            array_keys($this->_never_allowed_str),
            array_values($this->_never_allowed_str),
            $str
        );

        foreach ($this->_never_allowed_regex as $regex) {
            $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
        }

        return $str;
    }
}

/**
 * Remove control characters from a string.
 *
 * Strips every control character except newline (10), carriage return (13)
 * and horizontal tab (9), repeating until nothing more is removed so that
 * sequences revealed by an earlier removal are caught too.
 *
 * @param string $str
 * @param bool   $url_encoded Also strip the URL-encoded forms (%00-%1f)
 * @return string
 */
function remove_invisible_characters($str, $url_encoded = true)
{
    $non_displayables = array();

    if ($url_encoded) {
        // Case-insensitive so that "%0B" is caught as well as "%0b".
        $non_displayables[] = '/%0[0-8bcef]/i'; // url encoded 00-08, 11, 12, 14, 15
        $non_displayables[] = '/%1[0-9a-f]/i';  // url encoded 16-31
    }

    $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

    do {
        $str = preg_replace($non_displayables, '', $str, -1, $count);
    } while ($count);

    return $str;
}
转载请注明本页网址:
http://www.vephp.com/jiaocheng/155.html