vergilwang

浏览: 125979 次
性别:
来自: 北京

最近访客更多访客>>

iris19860111

u_lama

KEYS123456789

2644781824

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

Customization larbin

博客分类：

Crawler

配置英文原版说明：

http://larbin.sourceforge.net/custom-eng.html#larbin.conf

larbin.conf

###############################################
# Who are you ?
# mail of the one who launched larbin (YOUR mail)
From larbin2.6.3@unspecified.mail
# name of the bot (sent with http headers)
UserAgent larbin_2.6.3

############################################
# What are the inputs and outputs of larbin
# port on which is launched the http statistic webserver
# if unset or set to 0, no webserver is launched//larbin在运行时可以通过 http://localhost:8081查看运行情况；如果值为0，则不启动web服务器。
httpPort 8081
# port on which you can submit urls to fetch
# no input is possible if you comment this line or use port 0
#inputPort 1976

############################################
# parameters to adapt depending on your network
# Number of connexions in parallel (to adapt depending of your network speed)//并行获取网页的数量
pagesConnexions 100
# Number of dns calls in parallel//并行解析dns的数量
dnsConnexions 5
# How deep do you want to go in a site//网页抓取深度
depthInSite 5
# do you want to follow external links//是否允许抓取域名外连接
#noExternalLinks
# time between 2 calls on the same server (in sec) : NEVER less than 30//对同一个服务器获取网页的间隔时间
waitDuration 60
# Make requests through a proxy (use with care)
#proxy www 8080

##############################################
# now, let's customize the search

# first page to fetch (you can specify several urls)//抓取网页的起始URL,可指定多值
startUrl http://www.csdn.net/

# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line//限制爬虫抓取的网址域名后缀。
#limitToDomain .fr .dk .uk end

# What are the extensions you surely don't want//限制不被下载的对象的后缀,可通过注释或者增加后缀控制下载
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
#.tar .gz .tgz .zip .Z .rpm .deb
#.ps .dvi .pdf
#.png .jpg .jpeg .bmp .smi .tiff .gif
#.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
#.jar .java .class .diff
#.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end

option.h

// Larbin
// Sebastien Ailleret
// 27-05-01 -> 09-03-02

#ifndef LARBIN_CONFIG
#define LARBIN_CONFIG

#include "config.h"

/* This file allows a lot of customizations of larbin
* see doc/custom-eng.html for more details
* Options are enabled by leaving the #define active and disabled by
* commenting it out.
*/

/////////////////////////////////////////////////////////////
// Select the output module you want to use
// (the related code is in src/interf/useroutput.cc)

//#define DEFAULT_OUTPUT // do nothing except statistics; pages are not downloaded
#define SIMPLE_SAVE // save in files named save/dxxxxxx/fyyyyyy; each directory also holds an index file recording the urls already downloaded
//#define MIRROR_SAVE // save in files (respect sites hierarchy), under save/dxxxxx/url where url is the start url
//#define STATS_OUTPUT // do some stats on pages; viewable at http://localhost:8081/output.html

////////////////////////////////////////////////////////////
// Set up a specific search
// (the related code is in src/fetch/specbuf.cc)

//#define SPECIFICSEARCH // look for specific documents
//#define contentTypes ((char *[]) { "audio/mpeg", NULL }) // content types to look for
//#define privilegedExts ((char *[]) { ".mp3", NULL }) // file extensions to look for

// how do you want to manage specific pages (select one of the followings)
//#define DEFAULT_SPECIFIC // default: save them the same way as html pages
//#define SAVE_SPECIFIC // save specific pages on disk
//#define DYNAMIC_SPECIFIC // for big files, use a dynamically allocated buffer

//////////////////////////////////////////////////////////
// What do you want the crawler to do

// do you want to follow links in pages
// (if this option is not set, html pages are not parsed and links are not followed)
#define FOLLOW_LINKS

// do you want the crawler to associate to each page the list of its sons
// (i.e. the list of links contained in each page)
//#define LINKS_INFO

// do you want to associate a tag to pages (given in input)
// this allows to follow a page from input to output (and follow redirection)
//#define URL_TAGS

// do you want to suppress duplicate pages
// (if this option is set, larbin does not report success for a page whose
// content is the same as an earlier one)
#define NO_DUP

// do you want larbin to stop when everything has been fetched
//#define EXIT_AT_END

// do you want to fetch images
// if you enable this option, update forbiddenExtensions in larbin.conf
//#define IMAGES

// download everything (ie no check of content type in http headers)
//#define ANYTYPE

// do you want to manage cookies
//#define COOKIES

//////////////////////////////////////////////////////////
// Various options

// do you want to get cgi
// 0 : yes (all cgi) ; 1 : no (reject urls containing '?' or '=') ; 2 : NO ! (forbid all cgi)
#define CGILEVEL 0

// limit bandwidth usage (in octets/sec); there is no limit if unset
// be careful, larbin might use 10 to 20% more
//#define MAXBANDWIDTH 200000

// the depth is initialized each time a link goes to another site
// (if unset, the depth is never reset)
#define DEPTHBYSITE

//////////////////////////////////////////////////////////
// Efficiency vs feature

// do we need a special thread for output
// (if unset, no separate output thread is created)
// This is compulsory if it can block
// (not needed if you did not add code yourself)
//#define THREAD_OUTPUT

// if this option is set, larbin saves the hashtable from time to time
// this way it can restart from where it last stopped
// by reloading the table
//#define RELOAD

//////////////////////////////////////////////////////////
// now it's just if you need to know how it works

// do not launch the webserver
// this can be useful in order to launch no thread at all
//#define NOWEBSERVER

// do you want nice graphs in the stats page (histograms shown in real time)
#define GRAPH

// uncomment if you are not interested in debugging information
//#define NDEBUG // do not show debugging information in the webserver

// enable this if you really dislike stats (in the webserver)
// (statistics are then not shown in the webserver)
//#define NOSTATS

// enable this if you really like stats (on stdout)
#define STATS // print statistics every 8 seconds
//#define BIGSTATS // print every fetched page on screen (this slows larbin down)

// Please enable this option if you want to report a crash
// then compile with "make debug"
//#define CRASH

#endif // LARBIN_CONFIG

types.h

// Larbin
// Sebastien Ailleret
// 12-01-00 -> 10-12-01

#ifndef TYPES_H
#define TYPES_H

// Compile-time sizes and limits used by larbin.

// Size of the HashSize (max number of urls that can be fetched)
#define hashSize 64000000

// Size of the duplicate hashTable (same size as the url hash table)
#define dupSize hashSize
#define dupFile "dupfile.bak"

// Size of the arrays of Sites in main memory
#define namedSiteListSize 20000
#define IPSiteListSize 10000

// Max number of urls in ram
#define ramUrls 100000
#define maxIPUrls 80000 // this should allow less dns call

// Max number of urls per site in Url
#define maxUrlsBySite 254 // must fit in uint8_t

// time out when reading a page (in sec)
#define timeoutPage 30 // default time out
#define timeoutIncr 2000 // number of bytes for 1 more sec

// How long do we keep dns answers and robots.txt (2 days, expressed in seconds)
// The expansion is parenthesized so that it keeps its value when used inside
// a larger expression (e.g. x / dnsValidTime or x % dnsValidTime).
#define dnsValidTime (2*24*3600)

// Maximum size of a page that can be downloaded
// NOTE(review): nearlyFullPage is far below maxPageSize (90000 vs 1000000) —
// confirm this ratio is intended.
#define maxPageSize 1000000
#define nearlyFullPage 90000

// Maximum size of a robots.txt that is read
// the value used is min(maxPageSize, maxRobotsSize)
#define maxRobotsSize 10000

// How many forbidden items do we accept in a robots.txt
#define maxRobotsItem 100

// file name used for storing urls on disk
#define fifoFile "fifo"
#define fifoFileWait "fifowait"

// number of urls per file on disk
// should be equal to ramUrls for good interaction with restart
#define urlByFile ramUrls

// Size of the buffer used to read sockets
#define BUF_SIZE 16384
#define STRING_SIZE 1024

// Max size for a url
#define maxUrlSize 512
#define maxSiteSize 40 // max size for the name of a site

// max size for cookies
#define maxCookieSize 128

// Standard size of a fifo in a Site
#define StdVectSize maxRobotsItem

// maximum number of input connections
#define maxInput 5

// if we save files, how many files per directory and where
#define filesPerDir 2000 // number of pages saved per directory
#define saveDir "save/" // directory where downloaded pages are saved
#define indexFile "index.html" // for MIRROR_SAVE
#define nbDir 1000 // for MIRROR_SAVE

// options for SPECIFICSEARCH (except with DEFAULT_SPECIFIC)
#define specDir "specific/" // directory where specific files are saved
#define maxSpecSize 5000000 // maximum size of a specific file

// Possible outcomes when getting a page (success plus the various error reasons).
// nbAnswers equals the number of FetchError enumerators (16); the two
// presumably have to stay in sync when an enumerator is added — confirm
// against the code that uses nbAnswers.
#define nbAnswers 16
enum FetchError
{
  success         = 0,
  noDNS           = 1,
  noConnection    = 2,
  forbiddenRobots = 3,
  timeout         = 4,
  badType         = 5,
  tooBig          = 6,
  err30X          = 7,
  err40X          = 8,
  earlyStop       = 9,
  duplicate       = 10,
  fastRobots      = 11,
  fastNoConn      = 12,
  fastNoDns       = 13,
  tooDeep         = 14,
  urlDup          = 15
};

// standard types
// uint is a shorthand alias for unsigned int
typedef unsigned int uint;

#endif // TYPES_H

分享到：

Fedora QQ | Larbin使用方法2

2012-10-11 16:47
浏览 270
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Customization larbin

larbin.conf

option.h

types.h

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Customization larbin

larbin.conf

option.h

types.h

评论

发表评论

相关推荐

python crawler(1)

python crawler(2)

python spider code

python 爬虫抓站

scapy递归爬

scapy安装and简介

抓取网页并解析HTML

sgmllib Introduction

HTML and URL Parser

Crawler Index Page

Larbin 安装遇到的问题（fedora）

Larbin的使用

Larbin使用方法2

最近访客更多访客>>