`
wbj0110
  • 浏览: 1559376 次
  • 性别: Icon_minigender_1
  • 来自: 上海
文章分类
社区版块
存档分类
最新评论

Python登录人人网并抓取新鲜事

阅读更多

import contextlib
import cookielib
import sys
import urllib
import urllib2
from sgmllib import SGMLParser
03 class spider(SGMLParser):
04     def __init__(self,email,password):
05         SGMLParser.__init__(self)
06         self.h3=False
07         self.h3_is_ready=False
08         self.div=False
09         self.h3_and_div=False
10         self.a=False
11         self.depth=0
12         self.names=""
13         self.dic={}  
14           
15         self.email=email
16         self.password=password
17         self.domain='renren.com'
18         try:
19             cookie=cookielib.CookieJar()
20             cookieProc=urllib2.HTTPCookieProcessor(cookie)
21         except:
22             raise
23         else:
24             opener=urllib2.build_opener(cookieProc)
25             urllib2.install_opener(opener)      
26  
27     def login(self):
28         url='http://www.renren.com/PLogin.do'
29         postdata={
30                   'email':self.email,
31                   'password':self.password,
32                   'domain':self.domain 
33                   }
34         req=urllib2.Request(
35                             url,
36                             urllib.urlencode(postdata)           
37                             )
38          
39         self.file=urllib2.urlopen(req).read()
40         #print self.file
41     def start_h3(self,attrs):
42         self.h3 = True
43     def end_h3(self):
44         self.h3=False
45         self.h3_is_ready=True
46          
47     def start_a(self,attrs):
48         if self.h3 or self.div:
49             self.a=True
50     def end_a(self):
51         self.a=False
52          
53     def start_div(self,attrs):
54         if self.h3_is_ready == False:
55             return
56         if self.div==True:
57             self.depth += 1
58              
59         for k,v in attrs:
60             if k == 'class' and v == 'content':
61                 self.div=True;
62                 self.h3_and_div=True   #h3 and div is connected
63     def end_div(self):
64         if self.depth == 0:
65             self.div=False
66             self.h3_and_div=False
67             self.h3_is_ready=False
68             self.names=""
69         if self.div == True:
70             self.depth-=1
71     def handle_data(self,text):
72         #record the name
73         if self.h3 and self.a:
74             self.names+=text
75         #record says
76         if self.h3 and (self.a==False):
77             if not text:pass
78             else: self.dic.setdefault(self.names,[]).append(text)
79             return
80         if self.h3_and_div:
81             self.dic.setdefault(self.names,[]).append(text)
82              
83     def show(self):
84         type = sys.getfilesystemencoding()
85         for key in self.dic:
86             print ( (''.join(key)).replace(' ','')).decode('utf-8').encode(type), \
87                   ( (''.join(self.dic[key])).replace(' ','')).decode('utf-8').encode(type)
88  
89  
90  
91  
# Driver: log in, parse the returned page and print what was collected.
# Replace the two placeholder strings with real credentials before running.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# feed() may leave trailing data buffered; close() forces it to be processed.
renrenspider.close()
renrenspider.show()
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics