`
wbj0110
  • 浏览: 1552024 次
  • 性别: Icon_minigender_1
  • 来自: 上海
文章分类
社区版块
存档分类
最新评论

使用Google Analytics跟踪搜索引擎的抓取记录

阅读更多
<?php
/*
 * Name:Tracking Robots With Google Analytics
 * Author:biaodianfu
 * URI: http://www.biaodianfu.com/tracking-robots-with-google-analytics.html
 */
// --- Configuration -----------------------------------------------------------

// Endpoint that receives the tracking request (the query string is appended to it).
$utmGifLocation = 'http://www.google-analytics.com/__utm.gif';

// Value sent as the "utmwv" parameter (Google Analytics tracker version).
$utmv = '4.8.9';

// Tracking ID generated in the Google Analytics configuration; sent as "utmac".
$utmac = "UA-16811947-5";

// Domain name of the site being tracked; fed to domainHash() for the cookie values.
$domain = "biaodianfu.com";

// Page title sent as "utmdt"; left empty here (in WordPress, wp_title() could supply it).
$title = '';

/* Robots
 * Google  http://www.google.com/support/webmasters/bin/answer.py?hl=cn&answer=1061943
 * Baidu  http://tieba.baidu.com/club/9374916/p/10669831
 * Yahoo  http://en.wikipedia.org/wiki/Yahoo!_Slurp
 * Bing  http://www.bing.com/community/site_blogs/b/webmaster/archive/2009/07/17/new-bot-work-continues-at-bing.aspx
 * SOSO  http://help.soso.com/webspider.htm
 */
// Robot detection table: regex fragment => label reported to Google Analytics.
// Each key is wrapped in '#...#i' and matched against the User-Agent, so keys
// must not contain '#' and matching is case-insensitive.
//
// ORDER MATTERS for any consumer that stops at the first match: specific
// patterns are listed before the generic pattern that would also match the
// same User-Agent, and the catch-all "Unknown Robot" entry stays last.
$bots = array(
    // Google
    'compatible; Googlebot/([0-9.]{1,10})?' => 'Google',
    'Googlebot/([0-9.]{1,10})?' => 'Google',
    'Googl(e|ebot)(-News)/([0-9.]{1,10})' => 'Google News',
    'Googl(e|ebot)(-News)/' => 'Google News',
    'Googl(e|ebot)(-Image)/([0-9.]{1,10})' => 'Google Image',
    'Googl(e|ebot)(-Image)/' => 'Google Image',
    'Googl(e|ebot)(-Video)/([0-9.]{1,10})' => 'Google Video',
    'Googl(e|ebot)(-Video)/' => 'Google Video',
    'Googl(e|ebot)(-Sitemaps)/([0-9.]{1,10})?' => 'Google-Sitemaps',
    'Googl(e|ebot)(-Sitemaps)' => 'Google-Sitemaps',
    'compatible; Googlebot-Mobile/([0-9.]{1,10})?' => 'Google Mobile',
    'Googl(e|ebot)(-Mobile)/([0-9.]{1,10})?' => 'Google Mobile',
    'compatible; Mediapartners-Google/([0-9.]{1,10})?' => 'Google Mediapartners',
    'Mediapartners-Google[ /]([0-9.]{1,10})' => 'Google Mediapartners',
    'Mediapartners-Google' => 'Google Mediapartners',
    '^AdsBot-Google' => 'Google-AdsBot',
    '^Feedfetcher-Google' => 'Google-Feedfetcher',

    // Baidu. The mobile gateway entry comes first because the plain
    // 'Baiduspider' pattern also matches it under case-insensitive matching.
    'baiduspider-mobile-gate' => 'Baidu Mobile',
    'compatible; Baiduspider/([0-9.]{1,10})?' => 'Baidu',
    'Baiduspider' => 'Baidu',
    'BaiduCustomer' => 'Baidu Customer',
    'Baidu-Thumbnail' => 'Baidu Thumbnail',
    'Baidu-Transcoder' => 'Baidu Mobile',

    // Yahoo. The generic pattern (any "Yahoo! ... Slurp" or "Yahoo-") is last
    // in the group so the China / Blogs / MMCrawler entries stay reachable.
    'Yahoo! Slurp China' => 'Yahoo China',
    'YahooFeedSeeker' => 'Yahoo Feed',
    'Yahoo-Blogs' => 'Yahoo Blog',
    'Yahoo ContentMatch Crawler' => 'Yahoo Ads',
    // No trailing space: the crawler identifies itself as "Yahoo-MMCrawler/3.x".
    'Yahoo-MMCrawler' => 'Yahoo Image',
    'Yahoo(! ([a-z]{1,3} )?Slurp|-)' => 'Yahoo',

    // Microsoft (MSN / Live Search / Bing)
    'MSN(BOT|PTC)[ /]([0-9.]{1,10})' => 'MSN',
    'MS Search ([0-9.]{1,10}) Robot' => 'MSN',
    'MSNBOT_Mobile' => 'MSN Mobile',
    'MSMOBOT' => 'MSN Mobile',
    'MSNBOT-(MEDIA|PRODUCTS|ACADEMIC|NEWSBLOGS)[ /]([0-9.]{1,10})' => 'MS Live Search',
    // bingbot had no entry and was being reported as "Unknown Robot".
    'bingbot' => 'Bing',

    // SoSo
    'Sosospider' => 'SoSo',
    'Sosoblogspider' => 'SoSo Blog',
    'Sosoimagespider' => 'SoSo IMAGE',

    // Sogou
    'Sogou web spider[ /]([0-9.]{1,10})' => 'Sogou',
    'Sogou-Test-Spider[ /]([0-9.]{1,10})' => 'Sogou',
    'Sogou web robot' => 'Sogou',
    'Sogou orion spider[ /]([0-9.]{1,10})' => 'Sogou',

    // Youdao
    'YodaoBot[ /]([0-9.]{1,10})' => 'Youdao',
    'YodaoBot-Image[ /]([0-9.]{1,10})' => 'Youdao Image',
    'YodaoBot-Reader[ /]([0-9.]{1,10})' => 'Youdao Reader',

    // Others
    'QihooBot[ /]([0-9.]{1,10})' => 'Qihoo',
    'gougou' => 'GouGou',

    // Catch-all; must remain the final entry.
    '(robot|spider|harvest|bot|(?<!msie)crawler)' => 'Unknown Robot'
);

// Operating-system detection table: regex fragment => OS label.
// Each key is wrapped in '#...#msi' and matched against the User-Agent; a
// User-Agent that advertises an operating system is treated as a regular
// browser rather than a robot.
$os = array (
              // Was 'wi(n|ndows)?': the trailing '?' made the whole group optional,
              // so a bare "wi" (e.g. "Twitterbot") was classified as Windows.
              'win(dows)?' => 'windows',
              'linux[ /\-]([a-z0-9._]{1,10})' => 'linux',
              'linux' => 'linux',
              'Mac[ _]?OS[ _]?X[ /]([0-9.]{1,10})' => 'macosx',
              'Mac[ _]?OS[ _]?X' => 'macosx',
              'Mac 10.([0-9.]{1,10})' => 'macosx',
              'Mac(_Power|intosh.+P)PC' => 'macppc',
              'beos[ a-z]*([0-9.]{1,10})' => 'beos',
              'beos' => 'beos',
              'fedora' => 'fedora',
              'free[ \-]?bsd[ /]([a-z0-9._]{1,10})' => 'freebsd',
              'free[ \-]?bsd' => 'freebsd',
              'open[ \-]?bsd[ /]([a-z0-9._]{1,10})' => 'openbsd',
              'open[ \-]?bsd' => 'openbsd',
              'PCLinuxOS[ /]?([0-9.]{1,10})' => 'pclinux',
              'ubuntu' => 'ubuntu'
              );

/**
 * Compute the numeric hash of a domain name used as the prefix of the
 * __utma/__utmb/__utmc/__utmz/__utmv cookie values built by this script.
 *
 * The string is consumed byte by byte from its last character to its first.
 * NOTE(review): presumably mirrors the domain hash of Google's ga.js; verify
 * against the upstream script if exact parity matters.
 *
 * @param string $domain Domain name to hash.
 * @return int 1 when $domain is empty/falsy, otherwise the computed hash.
 */
function domainHash($domain) {
    $isBlank = !$domain || $domain == "";
    if ($isBlank) {
        return 1;
    }

    $hash = 0;
    $pos = strlen($domain);
    while ($pos-- > 0) {
        $code = (int) ord($domain[$pos]);

        // Shift the running hash (kept to 28 bits) and mix in the byte twice.
        $hash = (($hash << 6) & 0xfffffff) + $code + ($code << 14);

        // Fold bits 21-27 back into the low bits whenever any of them is set.
        $highBits = $hash & 0xfe00000;
        if ($highBits != 0) {
            $hash = $hash ^ ($highBits >> 21);
        }
    }

    return $hash;
}

/**
 * Fire a best-effort GET request at the given tracking URL and discard the
 * response.
 *
 * The visitor's User-Agent and Accept-Language headers are forwarded when
 * present. cURL is used when available; otherwise the HTTP stream wrapper is
 * used, provided allow_url_fopen is enabled. If neither transport is usable
 * the function does nothing. Short timeouts keep a slow endpoint from
 * stalling the page that is being served.
 *
 * @param string $utmUrl Fully built request URL, including the query string.
 * @return void
 */
function httpRequest($utmUrl){
    // Either header may be absent (CLI, stripped by a proxy, minimal clients).
    $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    $acceptLanguage = isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) ? $_SERVER['HTTP_ACCEPT_LANGUAGE'] : '';

    $connectTimeout = 2; // seconds
    $totalTimeout = 5;   // seconds

    if (function_exists('curl_exec')) {
        $ch = curl_init();
        if ($ch === false) {
            return;
        }
        curl_setopt($ch, CURLOPT_URL, $utmUrl);
        // Capture the response instead of echoing the GIF bytes into the page.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $totalTimeout);
        if ($userAgent !== '') {
            curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        }
        if ($acceptLanguage !== '') {
            curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Language: ' . $acceptLanguage));
        }
        curl_exec($ch);
        curl_close($ch);
    }
    elseif (filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN)) {
        $http = array(
            'method' => 'GET',
            'timeout' => $totalTimeout,
        );
        if ($userAgent !== '') {
            $http['user_agent'] = $userAgent;
        }
        if ($acceptLanguage !== '') {
            // The header name is "Accept-Language" (it was misspelled "Accepts-Language").
            $http['header'] = 'Accept-Language: ' . $acceptLanguage;
        }
        file_get_contents($utmUrl, false, stream_context_create(array('http' => $http)));
    }
}

// Main entry: report the current request to Google Analytics when it looks
// like a robot visit, i.e. it has no referrer, a non-empty User-Agent that
// matches none of the $os patterns, and a User-Agent matching a $bots pattern.
//
// Exactly one hit is sent per request. Previously the bot loop was nested
// inside the OS loop, which sent one hit for every (non-matching OS pattern,
// matching bot pattern) pair and never actually filtered anything out.
//
// NOTE(review): crawlers whose User-Agent advertises an operating system
// (e.g. a mobile crawler imitating a phone browser) are skipped by the OS
// filter. Confirm that this is the desired policy.
if ( empty( $_SERVER['HTTP_REFERER'] ) && !empty( $_SERVER['HTTP_USER_AGENT'] ) ){
    $userAgent = $_SERVER['HTTP_USER_AGENT'];

    // Step 1: a User-Agent that names an operating system is treated as a browser.
    $looksLikeBrowser = false;
    foreach ( $os as $patternos => $o ){
        if ( preg_match( '#'.$patternos.'#msi', $userAgent ) === 1 ){
            $looksLikeBrowser = true;
            break;
        }
    }

    if ( !$looksLikeBrowser ){
        // Step 2: the first matching entry of $bots decides the reported label.
        foreach ( $bots as $patternbots => $bot ){
            if ( preg_match( '#'.$patternbots.'#i', $userAgent ) !== 1 ){
                continue;
            }

            // Labels are sent with whitespace collapsed to hyphens.
            $botname = preg_replace( "/\\s{1,}/i", '-', $bot );

            $serverName = isset( $_SERVER['SERVER_NAME'] ) ? $_SERVER['SERVER_NAME'] : '';
            $requestUri = isset( $_SERVER['REQUEST_URI'] ) ? $_SERVER['REQUEST_URI'] : '';
            $remoteAddr = isset( $_SERVER['REMOTE_ADDR'] ) ? $_SERVER['REMOTE_ADDR'] : '';

            // Reverse DNS of the robot; fall back to the raw address when the
            // lookup cannot be performed (gethostbyaddr() returns false on
            // malformed input).
            $robotHost = ( $remoteAddr !== '' ) ? gethostbyaddr( $remoteAddr ) : false;
            if ( $robotHost === false ){
                $robotHost = $remoteAddr;
            }

            // Loop-invariant values, computed once per hit.
            $hash = domainHash( $domain );
            $now = time();

            // Every dynamic value is percent-encoded so that characters such as
            // '&', '?', '#' or spaces cannot break the query string. The
            // literal pieces of the "utmcc" value are already encoded.
            $utmUrl = $utmGifLocation . "?" .
                      "utmwv=" . $utmv .
                      "&utmn=" . rand(0, 0x7fffffff) .
                      "&utmhn=" . urlencode( $serverName ) .
                      "&utmdt=" . urlencode( $title ) .
                      "&utmr=-" .
                      "&utmp=" . urlencode( $requestUri ) .
                      "&utmac=" . $utmac .
                      "&utmcc=" .
                        '__utma%3D' . $hash . '.' . rand(0, 0x7fffffff) . '.' . $now . '.' . $now . '.' . $now . '.1%3B%2B' .
                        '__utmb%3D' . $hash . '%3B%2B' .
                        '__utmc%3D' . $hash . '%3B%2B' .
                        '__utmz%3D' . $hash . '.' . $now . '.1.1.utmccn%3D(organic)%7Cutmcsr%3D' . rawurlencode( $botname ) .
                            '%7Cutmctr%3D' . rawurlencode( $requestUri ) . '%7Cutmcmd%3Dorganic%3B%2B' .
                        '__utmv%3D' . $hash . '.Robot%20hostname%3A%20' . rawurlencode( $robotHost ) . '%3B';

            httpRequest( $utmUrl );

            // One hit per request: stop at the first matching robot pattern.
            break;
        }
    }
}
?>

本方法适合使用虚拟主机的朋友。如果您自己有服务器的话,建议还是开启服务器日志并使用awstats进行分析,因为这样你才能真正地了解蜘蛛,特别是可以对服务器状态码进行分析统计。

以上代码参考了一个法文网站。由于原代码比较老(2008年的),其中搜索引擎的User-Agent也不太适合中国网站,而且百度也在近期修改了User-Agent,所以我自己修改了一下代码。本代码还未经过测试,如果发现问题请及时联系。

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics