
Python crawler notes, part 2

 
1) lxml parses faster than BeautifulSoup and selects elements with XPath. An example (scraping Douban's Top 250 books into a CSV file):
from lxml import etree
import requests
import csv

fp = open('d:/doubanbook.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('name', 'url',  'author', 'publisher', 'date', 'price', 'rate', 'comment'))

urls = ['https://book.douban.com/top250?start={}'.format(str(i)) for i in range(0,250,25)]

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

for url in urls:
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//tr[@class="item"]')
    for info in infos:
        name = info.xpath('td/div/a/@title')[0]
        book_url = info.xpath('td/div/a/@href')[0]        # renamed so it does not shadow the outer loop variable
        book_infos = info.xpath('td/p/text()')[0]
        author = book_infos.split('/')[0]
        publisher = book_infos.split('/')[-3]
        date = book_infos.split('/')[-2]
        price = book_infos.split('/')[-1]
        rate = info.xpath('td/div/span[2]/text()')[0]
        comments = info.xpath('td/p/span/text()')
        comment = comments[0] if len(comments) != 0 else "空"   # "空" = no short comment available
        writer.writerow((name, book_url, author, publisher, date, price, rate, comment))

fp.close()

Note: after the CSV file is written, open it in Notepad and re-save it in UTF-8 format before it will open correctly (e.g. in Excel);
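
A simpler workaround (my own addition, not from the original) is to write the file with the 'utf-8-sig' codec, which prepends a BOM so Excel detects UTF-8 automatically and the Notepad step is unnecessary:

fp = open('d:/doubanbook.csv', 'wt', newline='', encoding='utf-8-sig')   # BOM lets Excel recognise UTF-8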

2) Writing the scraped data to an Excel file with xlwt (example: the Qidian novel listing)
import xlwt
import requests
from lxml import etree
import time

all_info_list = []

def get_info(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//ul[@class="all-img-list cf"]/li')
    for info in infos:
        title = info.xpath('div[2]/h4/a/text()')[0]
        author = info.xpath('div[2]/p[1]/a[1]/text()')[0]
        style_1 = info.xpath('div[2]/p[1]/a[2]/text()')[0]
        style_2 = info.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = style_1+'·'+style_2
        complete = info.xpath('div[2]/p[1]/span/text()')[0]
        introduce = info.xpath('div[2]/p[2]/text()')[0].strip()
        word = info.xpath('div[2]/p[3]/span/span/text()')[0].strip('万字')
        info_list = [title,author,style,complete,introduce,word]
        all_info_list.append(info_list)
    time.sleep(5)

if __name__ == '__main__':
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1,2)]
    for url in urls:
        get_info(url)
    header = ['title','author','style','complete','introduce','word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h in range(len(header)):
        sheet.write(0, h, header[h])
    i = 1
    for row in all_info_list:      # 'row' avoids shadowing the built-in name list
        j = 0
        for data in row:
            sheet.write(i, j, data)
            j += 1
        i += 1
    book.save('xiaoshuo.xls')
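
To spot-check the workbook that was just written, it can be read back with xlrd (a minimal sketch, assuming xlrd is installed; the file name matches the book.save() call above):

import xlrd

wb = xlrd.open_workbook('xiaoshuo.xls')   # the file produced above
sheet = wb.sheet_by_index(0)              # the single 'Sheet1'
print(sheet.nrows, 'rows written')        # header row + one row per novel
print(sheet.row_values(0))                # the header row
print(sheet.row_values(1))                # the first data row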

3) Selenium combined with PhantomJS, for example to log in to a page:
from selenium import webdriver
driver = webdriver.PhantomJS()
driver.get('https://www.douban.com/')
driver.implicitly_wait(10)
driver.find_element_by_id('form_email').clear()
driver.find_element_by_id('form_email').send_keys('your_username')     # placeholder: your Douban account
driver.find_element_by_id('form_password').clear()
driver.find_element_by_id('form_password').send_keys('your_password')  # placeholder: your Douban password
driver.find_element_by_class_name('bn-submit').click()
print(driver.page_source)
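
implicitly_wait() only sets a global timeout for element lookups. When one specific element has to appear (for example after an AJAX-driven login), an explicit wait is often more robust. A minimal sketch using Selenium's WebDriverWait, reusing the form_email id from the example above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the login form's email field is present
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'form_email'))
)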
     Selenium can also handle AJAX-loaded content directly, with no reverse engineering of the underlying requests.
    For example, scraping the "Shuoshuo" posts from a QQ Zone profile:
from selenium import webdriver
import time
import csv
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#qq_shuo = mydb['qq_shuo']

driver = webdriver.PhantomJS()
driver.maximize_window()

def get_info(qq):
    driver.get('http://user.qzone.qq.com/{}/311'.format(qq))
    driver.implicitly_wait(10)
    try:
        # if the login box is present, we still need to log in
        driver.find_element_by_id('login_div')
        need_login = True
    except Exception:
        need_login = False
    if need_login:
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys('XXXX')
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys('XXXX')
        driver.find_element_by_id('login_button').click()
        time.sleep(5)
    driver.implicitly_wait(3)
    try:
        # the owner-info icon only appears when the Qzone is accessible
        driver.find_element_by_id('QM_OwnerInfo_Icon')
        accessible = True
    except Exception:
        accessible = False
    if accessible:
        driver.switch_to.frame('app_canvas_frame')
        contents = driver.find_elements_by_css_selector('.content')

        times = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')
        for content, tim in zip(contents, times):
            data = {
                'time': tim.text,
                'content': content.text
            }
            print(content.text)
            # qq_shuo.insert_one(data)

if __name__ == '__main__':
    qq_lists = []
    fp = open('C:/Users/lyr/Downloads/QQmail.csv')
    reader = csv.DictReader(fp)
    for row in reader:
        qq_lists.append(row['电子邮件'].split('@')[0])
    fp.close()
    for item in qq_lists:
        get_info(item)
  

4) Using Selenium with PhantomJS to search Taobao and page through the results:
from selenium import webdriver
from lxml import etree
import time
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#taobao = mydb['taobao']

driver = webdriver.PhantomJS()
driver.maximize_window()

def get_info(url,page):
    page = page + 1
    driver.get(url)
    driver.implicitly_wait(10)
    selector = etree.HTML(driver.page_source)
    infos = selector.xpath('//div[@class="item J_MouserOnverReq  "]')
    for info in infos:
        data = info.xpath('div/div/a')[0]
        goods = data.xpath('string(.)').strip()
        price = info.xpath('div/div/div/strong/text()')[0]
        sell = info.xpath('div/div/div[@class="deal-cnt"]/text()')[0]
        shop = info.xpath('div[2]/div[3]/div[1]/a/span[2]/text()')[0]
        address = info.xpath('div[2]/div[3]/div[2]/text()')[0]
        print(goods)
        print(price)

        commodity = {
            'good':goods,
            'price':price,
            'sell':sell,
            'shop':shop,
            'address':address
        }
      #  taobao.insert_one(commodity)

    if page <= 50:
        NextPage(url,page)
    else:
        pass

def NextPage(url,page):
    driver.get(url)
    driver.implicitly_wait(10)
    # simulate clicking the next-page link
    driver.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
    time.sleep(4)
    driver.get(driver.current_url)
    driver.implicitly_wait(10)
    get_info(driver.current_url,page)

if __name__ == '__main__':
    page = 1
    url = 'https://www.taobao.com/'
    driver.get(url)
    driver.implicitly_wait(10)
    driver.find_element_by_id('q').clear()
    driver.find_element_by_id('q').send_keys('男士短袖')   # search keyword: men's short-sleeve shirts
    driver.find_element_by_class_name('btn-search').click()
    get_info(driver.current_url,page)

5) Quick start with Scrapy
    In a directory of your choice, run scrapy startproject <project-name>; the generated project layout is sketched below.
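
For the xiaozhu example used below the command would be scrapy startproject xiaozhu, which typically generates a layout like this (a sketch; details can differ slightly between Scrapy versions):

xiaozhu/
    scrapy.cfg            # project configuration entry point
    xiaozhu/
        __init__.py
        items.py          # field definitions (edited below)
        pipelines.py      # item storage / post-processing
        settings.py       # project settings
        spiders/          # spider modules go here
            __init__.py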
   
Then the fields to be scraped are declared in items.py:
from scrapy.item import Item, Field

class XiaozhuItem(Item):
    title= Field()
    address = Field()
    price = Field()
    lease_type = Field()
    suggestion = Field()
    bed = Field()
  
  Then create a new spider file under the spiders directory:
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from xiaozhu.items import XiaozhuItem

class xiaozhu(CrawlSpider):
    name = 'xiaozhu'
    start_urls = ['http://bj.xiaozhu.com/fangzi/6937392816.html']

    def parse(self, response):
        item = XiaozhuItem()
        selector = Selector(response)
        title = selector.xpath('//h4/em/text()').extract()[0]
        address = selector.xpath('//p/span[@class="pr5"]/text()').extract()[0].strip()
        price = selector.xpath('//*[@id="pricePart"]/div[1]/span/text()').extract()[0]
        lease_type = selector.xpath('//*[@id="introduce"]/li[1]/h6/text()').extract()[0]
        suggestion = selector.xpath('//*[@id="introduce"]/li[2]/h6/text()').extract()[0]
        bed = selector.xpath('//*[@id="introduce"]/li[3]/h6/text()').extract()[0]

        item['title'] = title
        item['address'] = address
        item['price'] = price
        item['lease_type'] = lease_type
        item['suggestion'] = suggestion
        item['bed'] = bed

        yield item

Saving and processing the scraped fields is handled by an item pipeline:
class XiaozhuPipeline(object):
    def process_item(self, item, spider):
        # append each item to a plain text file, one field per line
        with open('d:/xiaozhu.txt', 'a+', encoding='utf-8') as fp:
            fp.write(item['title'] + '\n')
            fp.write(item['address'] + '\n')
            fp.write(item['price'] + '\n')
            fp.write(item['lease_type'] + '\n')
            fp.write(item['suggestion'] + '\n')
            fp.write(item['bed'] + '\n')
        return item

    Finally, register the pipeline in settings.py:
ITEM_PIPELINES = {'xiaozhu.pipelines.XiaozhuPipeline':300}
  You can also create a main.py under the spiders directory, so the spider can be started from an IDE instead of the command line:
from scrapy import cmdline
cmdline.execute("scrapy crawl xiaozhu".split())
6) To set the request headers and export the results to a CSV file, configure settings.py:
ROBOTSTXT_OBEY = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
DOWNLOAD_DELAY = 4
FEED_URI = 'file:d:/photo/zhuanti.csv'
FEED_FORMAT = 'csv'
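
As an alternative to FEED_URI / FEED_FORMAT (my own addition, not in the original), the export can also be requested per run with Scrapy's -o option, for example from a main.py like the one shown earlier (spider name assumed to be xiaozhu):

from scrapy import cmdline
cmdline.execute('scrapy crawl xiaozhu -o zhuanti.csv'.split())   # -o infers the CSV format from the extension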
  

7) Saving Scrapy results to a MySQL database

import pymysql

class JianshuitPipeline(object):
    def __init__(self):
        # open one connection when the pipeline is instantiated
        self.conn = pymysql.connect(host='localhost', user='root', passwd='123456',
                                    db='mydb', port=3306, charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into jianshu1 (user, time, title, view, comment, lik, gain) "
               "values (%s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['user'], item['time'], item['title'], item['view'],
                                  item['comment'], item['like'], item['gain']))
        self.conn.commit()
        return item
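
The pipeline assumes that a jianshu1 table already exists in mydb. A minimal one-off sketch for creating it (the varchar column types are my assumption; adjust them to the actual data):

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='123456',
                       db='mydb', port=3306, charset='utf8')
cursor = conn.cursor()
# hypothetical schema matching the insert statement used in the pipeline
cursor.execute("""
    create table if not exists jianshu1 (
        `user`    varchar(64),
        `time`    varchar(64),
        `title`   varchar(255),
        `view`    varchar(32),
        `comment` varchar(32),
        `lik`     varchar(32),
        `gain`    varchar(32)
    ) default charset=utf8
""")
conn.commit()
conn.close()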
