python spider code

vergilwang

浏览: 125055 次
性别:
来自: 北京

最近访客更多访客>>

iris19860111

u_lama

KEYS123456789

2644781824

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Crawler
Python

Python代码
#FileName:toolbox_insight.py
import bisect
import gzip
import os
import string
import StringIO
import threading
import time
import urllib2
from sgmllib import SGMLParser
#rewriteSGMLParserforstart_a
classBasegeturls(SGMLParser):#这个Basegeturls类作用是分析下载的网页，把网页中的所有链接放在self.url中。
defreset(self):
self.url=[]
SGMLParser.reset(self)
defstart_a(self,attrs):
href=[vfork,vinattrsifk=='href']
ifhref:
self.url.extend(href)
# For quickly finding values that have been seen before.
class Newlist(list):
    """A list kept in ascending order, with a combined lookup-and-insert.

    The ordering is only maintained if items are added exclusively through
    ``find()``.
    """

    def find(self, num):
        """Report whether ``num`` is already stored, inserting it if it is not.

        Args:
            num: the value to look up; it must be orderable against the
                items already in the list.

        Returns:
            True if ``num`` was already present (the list is left unchanged);
            False if it was absent, in which case it has been inserted at its
            sorted position.
        """
        # bisect_left gives the leftmost slot where num could go, so an equal
        # element, if any, sits exactly at that index.
        index = bisect.bisect_left(self, num)
        if index < len(self) and self[index] == num:
            return True
        self.insert(index, num)
        return False
# The "reptile" below is, as the name suggests, a crawler.
class reptile(threading.Thread):
    """Crawler thread: downloads queued URLs and harvests the links they contain.

    Args:
        Name: name of the crawler thread; also used as the name of its
            private download folder.
        queue: task queue shared by all crawlers. Each item is the URL of a
            page to download; a ``None`` item tells the crawler to stop.
        result: queue that receives every link found in a downloaded page.
        Flcok: lock shared by all crawlers, held while appending to
            ``configfile``.
        inittime: not used by this program; kept for future extension.
        downloadway: directory under which downloaded pages are stored.
        configfile: file that records, one pair per line, the saved path and
            the URL of each downloaded page.
        maxnum: maximum number of pages this crawler downloads before it
            stops on its own.
    """

    def __init__(self, Name, queue, result, Flcok, inittime=0.00001,
                 downloadway='D:\\bbs\\', configfile='D:\\bbs\\conf.txt',
                 maxnum=10000):
        threading.Thread.__init__(self, name=Name)
        self.queue = queue
        self.result = result
        self.Flcok = Flcok
        self.inittime = inittime
        self.mainway = downloadway
        self.configfile = configfile
        self.num = 0  # number of pages downloaded so far
        self.maxnum = maxnum
        # Each crawler saves into a sub-folder named after itself. Creating
        # it only when missing lets the program be run again over an existing
        # download directory instead of crashing in os.makedirs.
        folder = os.path.join(downloadway, self.name)
        if not os.path.isdir(folder):
            os.makedirs(folder)
        # Trailing separator kept so that file names can simply be appended.
        self.way = os.path.join(folder, '')

    def _gunzip(self, raw):
        """Return ``raw`` decompressed, or ``raw`` itself if it is not gzip data."""
        try:
            return gzip.GzipFile(fileobj=StringIO.StringIO(raw)).read()
        except IOError:
            # The server ignored the gzip request and sent the page as-is.
            print('unused gzip')
            return raw

    def _save(self, url, data):
        """Write ``data`` to the next numbered file and log it in the config file."""
        way = self.way + str(self.num) + '.html'
        self.num += 1
        # Binary mode: the payload is raw bytes from the network and must not
        # go through newline translation.
        with open(way, 'wb') as page_file:
            page_file.write(data)
        # The config file is shared by every crawler; the with-block releases
        # the lock even if the write fails, so other crawlers cannot deadlock.
        with self.Flcok:
            with open(self.configfile, 'a') as confile:
                confile.write(way + ' ' + url + '\n')

    def run(self):
        """Download URLs from the task queue until told to stop.

        The loop ends when a ``None`` item is received or once ``maxnum``
        pages have been saved. A failure on one URL is reported and the
        crawler moves on to the next one.
        """
        opener = urllib2.build_opener()
        while True:
            url = self.queue.get()
            if url is None:
                # Sentinel used by the caller to end the crawler's life.
                break
            parser = Basegeturls()
            request = urllib2.Request(url)
            # Ask for a gzip-compressed body to reduce network traffic.
            request.add_header('Accept-encoding', 'gzip')
            try:
                page = opener.open(request)
                try:
                    if page.code == 200:
                        data = self._gunzip(page.read())
                        try:
                            parser.feed(data)
                        except Exception:
                            # Some responses cannot be parsed as markup, e.g.
                            # when the whole "page" is an image.
                            print('I am here')
                        for item in parser.url:
                            self.result.put(item)
                        self._save(url, data)
                finally:
                    # Always release the connection, even after an error.
                    page.close()
                if self.num >= self.maxnum:
                    break
            except Exception:
                print('end error')
# Like the crawler, this is a thread class. It processes the URLs that the
# crawlers put into their result queue, keeping only pages of the same server.
class proinsight(threading.Thread):
    """Filter thread: normalises harvested links and forwards the new ones.

    Args:
        queue: input queue of raw links (the crawlers' ``result`` queue); a
            ``None`` item tells the thread to stop.
        list: object with a ``find(item)`` method that returns True when the
            item was seen before and otherwise records it and returns False
            (a ``Newlist`` instance in the driver script).
        homepage: URL prefix of the site being crawled; absolute links that
            do not start with it are dropped, relative links are appended
            to it.
        inqueue: output queue that receives each accepted, not-yet-seen URL.
    """

    def __init__(self, queue, list, homepage, inqueue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.list = list
        self.homepage = homepage
        self.inqueue = inqueue

    def run(self):
        """Consume links until a ``None`` sentinel arrives."""
        homepage = self.homepage
        while True:
            item = self.queue.get()
            if item is None:
                break
            # Drop a leading CRLF and a trailing slash before comparing.
            if item.startswith('\r\n'):
                item = item[2:]
            if item.endswith('/'):
                item = item[:-1]
            if not item:
                # Nothing left to resolve (empty href, bare '/' or CRLF);
                # indexing into it would raise and kill the thread.
                continue
            if item.startswith(('http://', 'https://')):
                # Absolute link: keep it only if it is under the homepage.
                if not item.startswith(homepage):
                    continue
            elif item.startswith(('/java', 'java')):
                # Skips links such as "javascript:..." pseudo-URLs.
                continue
            else:
                # Relative link: resolve it against the homepage.
                if not item.startswith('/'):
                    item = '/' + item
                item = homepage + item
            if not self.list.find(item):
                self.inqueue.put(item)
下面的是一个主函数过程
我下载的网站是http://bbs.hit.edu.cn
开始网页是http://bbs.hit.edu.cn/mainpage.php
# FileName: test
# Driver script: starts the crawler threads and the link filter, seeds the
# task queue with the start page, then feeds it the links the filter accepts.
from toolbox_insight import Newlist, proinsight, reptile
from Queue import Queue
import threading
import sys

num = int(raw_input('Enter the number of thread:'))
pnum = int(raw_input('Enter the number of download pages:'))
mainpage = str(raw_input('The main page:'))
startpage = str(raw_input('Start page:'))

queue = Queue()    # URLs waiting to be downloaded by the crawlers
key = Queue()      # raw links harvested by the crawlers
inqueue = Queue()  # links accepted by the filter, not yet scheduled
seen = Newlist()   # every URL accepted so far, used for de-duplication
thlist = []
Flock = threading.RLock()  # guards the crawlers' shared config file

for i in range(num):
    th = reptile('th' + str(i), queue, key, Flock)
    thlist.append(th)
pro = proinsight(key, seen, mainpage, inqueue)
pro.start()
for th in thlist:
    th.start()

queue.put(startpage)
# NOTE(review): inqueue.get() blocks forever if the site yields fewer than
# pnum distinct links - confirm whether a timeout is wanted here.
for i in range(pnum):
    queue.put(inqueue.get())

# One sentinel per crawler so that every crawler thread terminates.
for i in range(num):
    queue.put(None)
for th in thlist:
    th.join()
# Only after the crawlers have stopped producing links may the filter be
# stopped; without this sentinel its thread blocks forever and the process
# never exits.
key.put(None)
pro.join()

分享到：

python 爬虫抓站 | python crawler(2)

2012-07-20 15:19
浏览 254
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python spider code

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python spider code

评论

发表评论

相关推荐

python编码问题总结

python补全插件

python类型转换

python正则

python匹配中文

python读写

python路径文件api

pythonGB2312乱码问题

python文件读写2

python debug【】

python crawler(1)

python crawler(2)

python 爬虫抓站

scapy递归爬

scapy安装and简介

pydev eclipse插件安装

python list

List Tuple Dictionary 区别

抓取网页并解析HTML

python IO

最近访客更多访客>>