plugins/spamx/SLVbase.class.php
author Dirk Haun <dirk@haun-online.de>
Thu, 29 Oct 2009 13:00:11 +0100
branch HEAD
changeset 7397 c27e9026f22a
parent 6838 cb1ba8d99085
permissions -rw-r--r--
Fixed inclusion protection
     1 <?php
     2 
     3 /**
     4 * File: SLVbase.class.php
     5 * Spam Link Verification (SLV) Base Class
     6 *
     7 * Copyright (C) 2006 by the following authors:
     8 * Author        Dirk Haun       dirk AT haun-online DOT de
     9 *
    10 * Licensed under the GNU General Public License
    11 *
    12 * @package Spam-X
    13 * @subpackage Modules
    14 */
    15 
    16 if (strpos(strtolower($_SERVER['PHP_SELF']), 'slvbase.class.php') !== false) {
    17     die('This file can not be used on its own!');
    18 }
    19 
    20 /**
    21 * Sends posts to SLV (linksleeve.org) for examination
    22 *
    23 * @author Dirk Haun     dirk AT haun-online DOT de
    24 * based on the works of Tom Willet (Spam-X) and Russ Jones (SLV)
    25 * @package Spam-X
    26 *
    27 */
    28 class SLVbase {
    29 
    30     var $_debug = false;
    31 
    32     var $_verbose = false;
    33 
    34     /**
    35     * Constructor
    36     */
    37     function SLVbase()
    38     {
    39         $this->_debug = false;
    40         $this->_verbose = false;
    41     }
    42 
    43     /**
    44     * Check for spam links
    45     *
    46     * @param    string  $post   post to check for spam
    47     * @return   boolean         true = spam found, false = no spam
    48     *
    49     * Note: Also returns 'false' in case of problems communicating with SLV.
    50     *       Error messages are logged in Geeklog's error.log
    51     *
    52     */
    53     function CheckForSpam ($post)
    54     {
    55         global $_SPX_CONF;
    56 
    57         require_once ('XML/RPC.php');
    58 
    59         $retval = false;
    60 
    61         if (empty ($post)) {
    62             return $retval;
    63         }
    64 
    65         $links = $this->prepareLinks ($post);
    66         if (empty ($links)) {
    67             return $retval;
    68         }
    69 
    70         if (!isset ($_SPX_CONF['timeout'])) {
    71             $_SPX_CONF['timeout'] = 5; // seconds
    72         }
    73 
    74         if ($this->_verbose) {
    75             SPAMX_log ("Sending to SLV: $links");
    76         }
    77 
    78         $params = array (new XML_RPC_Value ($links, 'string'));
    79         $msg = new XML_RPC_Message ('slv', $params);
    80         $cli = new XML_RPC_Client ('/slv.php', 'http://www.linksleeve.org');
    81 
    82         if ($this->_debug) {
    83             $client->setDebug (1);
    84         }
    85 
    86         $resp = $cli->send ($msg, $_SPX_CONF['timeout']);
    87         if (!$resp) {
    88             COM_errorLog ('Error communicating with SLV: ' . $cli->errstr
    89                           . '; Message was ' . $msg->serialize());
    90         } else if ($resp->faultCode ()) {
    91             COM_errorLog ('Error communicating with SLV. Fault code: '
    92                           . $resp->faultCode() . ', Fault reason: '
    93                           . $resp->faultString() . '; Message was '
    94                           . $msg->serialize());
    95         } else {
    96             $val = $resp->value();
    97             // note that SLV returns '1' for acceptable posts and '0' for spam
    98             if ($val->scalarval() != '1') {
    99                 $retval = true;
   100                 SPAMX_log ("SLV: spam detected");
   101             } else if ($this->_verbose) {
   102                 SPAMX_log ("SLV: no spam detected");
   103             }
   104         }
   105 
   106         return $retval;
   107     }
   108 
   109     /**
   110     * Check whitelist
   111     *
   112     * Check against our whitelist of sites not to report to SLV. Note that
   113     * URLs starting with $_CONF['site_url'] have already been removed earlier.
   114     *
   115     * @param    array   &$links     array of URLs from a post
   116     * @return   void ($links is passed by reference and modified in place)
   117     *
   118     */
   119     function checkWhitelist (&$links)
   120     {
   121         global $_TABLES;
   122 
   123         $result = DB_query ("SELECT value FROM {$_TABLES['spamx']} WHERE name='SLVwhitelist'", 1);
   124         $nrows = DB_numRows ($result);
   125 
   126         for ($i = 0; $i < $nrows; $i++) {
   127             $A = DB_fetchArray ($result);
   128             $val = $A['value'];
   129             $val = str_replace ('#', '\\#', $val);
   130 
   131             foreach ($links as $key => $link) {
   132                 if (!empty ($link)) {
   133                     if (preg_match ("#$val#i", $link)) {
   134                         $links[$key] = '';
   135                     }
   136                 }
   137             }
   138         }
   139     }
   140 
   141     /**
   142     * Extract links
   143     *
   144     * Extracts all the links from a post; expects HTML links, i.e. <a> tags
   145     *
   146     * @param    string  $comment    The post to check
   147     * @return   array               All the URLs in the post
   148     *
   149     */
   150     function getLinks ($comment)
   151     {
   152         global $_CONF;
   153 
   154         $links = array();
   155 
   156         preg_match_all( "/<a[^>]*href=[\"']([^\"']*)[\"'][^>]*>(.*?)<\/a>/i",
   157                         $comment, $matches );
   158         for ($i = 0; $i < count ($matches[0]); $i++) {
   159             $url = $matches[1][$i];
   160             if (!empty ($_CONF['site_url']) &&
   161                     strpos ($url, $_CONF['site_url']) === 0) {
   162                 // skip links to our own site
   163                 continue;
   164             } else {
   165                 $links[] = $url;
   166             }
   167         }
   168 
   169         return $links;
   170     }
   171 
   172     /**
   173     * Extract only the links from the post
   174     *
   175     * SLV has a problem with non-ASCII character sets, so we feed it the URLs
   176     * only. We also remove all URLs containing our site's URL.
   177     *
   178     * Since we don't know if the post is in HTML or plain ASCII, we run it
   179     * through getLinks() twice.
   180     *
   181     * @param    string  $comment    The post to check
   182     * @return   string              All the URLs in the post, sep. by linefeeds
   183     *
   184     */
   185     function prepareLinks ($comment)
   186     {
   187         $links = array();
   188         $linklist = '';
   189 
   190         // some spam posts have extra backslashes
   191         $comment = stripslashes ($comment);
   192 
   193         // some spammers have yet to realize that we're not supporting BBcode
   194         // but since we want the URLs, convert it here ...
   195         $comment = preg_replace ('/\[url=([^\]]*)\]/i', '<a href="\1">',
   196                                  $comment);
   197         $comment = str_replace (array ('[/url]', '[/URL]'),
   198                                 array ('</a>',   '</a>'  ), $comment);
   199 
   200         // get all links from <a href="..."> tags
   201         $links = $this->getLinks ($comment);
   202 
   203         // strip all HTML, then get all the plain text links
   204         $comment = COM_makeClickableLinks (strip_tags ($comment));
   205         $links += $this->getLinks ($comment);
   206 
   207         if (count ($links) > 0) {
   208             $this->checkWhitelist ($links);
   209             $linklist = implode ("\n", $links);
   210         }
   211 
   212         return trim ($linklist);
   213     }
   214 }
   215 
   216 ?>