`

Python + requests + lxml 的几个例子

 
阅读更多
例子没有加入失败后重做的功能,这个也可以考虑增加。
第三个例子加入了访问频率控制
遍历图片的例子加入多线程,明显爬得快很多

解析163新闻列表的例子:
#!/usr/bin/python
# encoding=gbk

# 这里只是尝试遍历新闻而已;还有很多链接(博客、主题之类的)没有处理
# 如果要实现,就自己根据url判断页面到底是什么类型,然后做相应的处理

import sys
import requests
import datetime
import time
import MySQLdb

import chardet

import lxml.html.soupparser as soupparser
import lxml.etree as etree

# Timestamp taken when the script starts; read again at the bottom of the
# script to print the elapsed run time.
start_datetime = datetime.datetime.now()


def parseFromWin1252(str):
    """Repair text whose GBK bytes were mis-decoded as windows-1252.

    Some news pages come back garbled: the bytes are GBK but were decoded as
    windows-1252.  Re-encoding with windows-1252 recovers the raw bytes, which
    are then decoded as GBK.

    :param str: the (possibly garbled) text, or None.  The parameter name
        shadows the builtin but is kept for backward compatibility.
    :return: the repaired text; the input unchanged when it cannot be
        round-tripped (it was not garbled in this way); None when given None.
    """
    # Element.text is None for tags that only contain child elements, so a
    # missing title must not blow up with AttributeError here.
    if str is None:
        return None
    try:
        return str.encode("windows-1252").decode("GBK")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Either the text has characters outside windows-1252 (already proper
        # text) or the recovered bytes are not valid GBK: leave it alone.
        return str


def resolveAndSaveNewContentFromLink(link, linkTitle, cursor):
    # 打开一个链接,并得到里面的内容
    # 有两种情况无法得到,1.没有标题的,可能是一个主题的页面;2.报异常的,还没处理,所以无法拿到内容
    print u"处理:", link
    request = requests.get(link)

    try:
        dom = soupparser.fromstring(request.content)
        body = dom[0]
        titles = body.xpath("//h1[@id='h1title']")
        if len(titles) > 0:
            #有标题
            title = parseFromWin1252(titles[0].text)
            print u"@TITLE:", request.encoding, title, link
            newContents = body.xpath("//div[@id='endText']//p")

            alist = []
            for content in newContents:
                if content.text != None:
                    alist.append(content.text)

            text = parseFromWin1252("<br><br>".join(alist))

            values = [link, title, text, "Success"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)

        else:
            #无标题
            title = parseFromWin1252(linkTitle)
            print u"#NO_TITLE:", request.encoding, title, link

            values = [link, title, "", "NO_TITLE"]
            cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)
    except TypeError:
        #报异常
        title = parseFromWin1252(linkTitle)
        print u"$TypeError:", request.encoding, title, link

        values = [link, title, "", "TypeError"]
        cursor.execute("insert into texts(url,title,text,statue) value(%s,%s,%s,%s)", values)


#定义方法
def resolveAndSaveLinks(body, cursor):
    print u"解析html的Link"
    links = body.xpath("//ul[@class='mod-list main-list']//a")

    print u"处理数据"
    count = 1;
    for item in links:
        # 有em标签的无法解析
        if item.text != None:
            values = [item.get("href"), item.text]
            cursor.execute("insert into links(url,text) value(%s,%s)", values)
            resolveAndSaveNewContentFromLink(item.get("href"), item.text, cursor)
            #time.sleep(100) #是否需要暂停,免得被封掉?
            print u"完成","<resolveAndSaveLinks>[%s:%s]" %(len(links), count)
            count = count + 1
            print "----------------------------------------------------------"

    print u"保存数据完成,记录数[", len(links), "]"


def resolveAndSaveEmInLinks(body, cursor):
    print u"解析html的包含em元素的Link"
    ems = body.xpath("//ul[@class='mod-list main-list']//em")

    print u"处理数据"
    count = 1;
    for item in ems:
        values = [item.getparent().get("href"), item.text]
        cursor.execute("insert into links(url,text) value(%s,%s)", values)
        resolveAndSaveNewContentFromLink(item.getparent().get("href"), item.text, cursor)
        #time.sleep(100) #是否需要暂停,免得被封掉?
        print u"完成","<resolveAndSaveEmInLinks>[%s:%s]" %(len(ems), count)
        count = count + 1
        print "----------------------------------------------------------"

    print u"保存数据完成,记录数[", len(ems), "]"


def resolve():
    print u"打开链接"
    req = requests.get("http://news.163.com/")
    content = req.content
    dom = soupparser.fromstring(content)
    body = dom[1]

    print u"链接数据库"
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("delete from links")
    cursor.execute("delete from texts")

    #resolveAndSaveNewContentFromLink("http://auto.163.com/13/0929/02/99TGSGRJ00084TUR.html", u"测试", cursor)
    #if True:
    #    return

    print u"解析并保存到数据库"
    #遍历不包含em标签的link
    resolveAndSaveLinks(body, cursor)
    #遍历包含em标签的link
    resolveAndSaveEmInLinks(body, cursor)

    cursor.close()
    conn.close()
    print u"遍历完成"

# Run the crawl, then report how long it took.
resolve()
end_datetime = datetime.datetime.now()
# NOTE: timedelta.seconds is only the seconds component of the difference
# (whole days are not included).
print u"耗时", (end_datetime - start_datetime).seconds, u"秒"


遍历糗事百科的文章,只遍历导航上面的几个分类,热门,最新,等等
#!/usr/bin/ScanningQiuShiBaiKe.py
# encoding=gbk

import sys
import os
import MySQLdb
import requests
import datetime
import time
import lxml.html.soupparser as soupparser
import lxml.etree as etree

# Placeholder token inside paging URL templates; it is replaced with the real
# page number before a page is requested.
currentPageId = "currentPageId"


def getImageFile(imgUrl): #文件下载,并写入本地硬盘,返回文件名
    local_filename = imgUrl.split('/')[-1]
    local_filename=  "/home/pandy/tmp/"+local_filename
    print u"下载文件成功: ", local_filename
    r = requests.get(imgUrl, stream=True) # here we need to set stream = True parameter
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
        f.close()
        return local_filename
    return None


def scannintArticle(cursor, type, url, article):   #处理一个主题的信息
    articleStr = etree.tostring(article)
    articleBody = soupparser.fromstring(articleStr)
    details = articleBody.xpath("//div[@class='detail']")
    authors = articleBody.xpath("//div[@class='author']")
    contents = articleBody.xpath("//div[@class='content']")
    thumbs = articleBody.xpath("//div[@class='thumb']")

    values = [type, url]
    if len(details) > 0:
        detailStr = etree.tostring(details[0])
        detail = soupparser.fromstring(detailStr)
        values.append(detail.xpath("//a")[0].text)
        values.append(detail.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")

    if len(authors) > 0:
        authorStr = etree.tostring(authors[0])
        author = soupparser.fromstring(authorStr)
        values.append(author.xpath("//a")[0].text)
        values.append(author.xpath("//a")[0].get("href"))
    else:
        values.append("")
        values.append("")

    if len(contents) > 0:
        contentStr = etree.tostring(contents[0])
        values.append(contents[0].text)
    else:
        values.append("")
        values.append("")

    if len(thumbs) > 0:
        thumbStr = etree.tostring(thumbs[0])
        thumb = soupparser.fromstring(thumbStr)
        imgUrl = thumb.xpath("//img")[0].get("src")
        values.append(imgUrl)

        #下载图片,先临时存放,然后在读取出来保存到数据库,并删除
        local_filename = getImageFile(imgUrl)
        f = open( local_filename , "rb" )
        b = f.read()
        f.close()
        os.remove(local_filename)
        values.append(MySQLdb.Binary(b))
    else:
        values.append("")
        values.append(None)


    values.append("Success")
    print values
    cursor.execute(
        "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content,status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        values)


def scanning4typeArticle(cursor, type, url): #扫描一页
    request = requests.get(url)
    #print request.encoding
    print url
    #print len(request.content)
    #print request.content
    try:
        dom = soupparser.fromstring(request.content)
        body = dom[1]
        #查找一页下面的主题
        articleList = body.xpath("//div[@class='block untagged mb15 bs2']")
        for article in articleList:
            scannintArticle(cursor, type, url, article)
    except:
        print "Error"
        values = [type, url, '', '', '', '', '', '',None, "Error"]
        cursor.execute(
            "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content, status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            values)


def scanning4type(cursor, type, url, subfix):   #得到分页数,然后一页一页的打开
    print u"开始扫描文章"

    request = requests.get(url);
    dom = soupparser.fromstring(request.content)
    body = dom[0]

    #得到底部分页的最大值
    pagebars = body.xpath("//div[@class='pagebar']/a")
    if len(pagebars) > 2:
        maxPageSize = int(pagebars[len(pagebars) - 2].text) + 1
        #一页一页的打开
        for i in range(1, maxPageSize):
            scanningUrl = "".join([url, subfix]).replace(currentPageId, str(i))
            scanning4typeArticle(cursor, type, scanningUrl)

    print u"扫描文章完成"


def main():
    """Empty ``qs_article`` and crawl the enabled qiushibaike categories.

    The work is committed once at the end.  MySQLdb does not autocommit, so
    without the commit the rows are rolled back on transactional tables when
    the connection closes.  The cursor and connection are closed even when
    the crawl raises; in that case nothing is committed.
    """
    # Open the database
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("delete from qs_article")

            # Categories from the site navigation; only the first is enabled.
            scanning4type(cursor, "8HR", "http://www.qiushibaike.com/8hr", "".join(["/page/", "currentPageId", "?s=4602020"]))
            #scanning4type(cursor, "HOT", "http://www.qiushibaike.com/hot", "".join(["/page/", "currentPageId", "?s=4602057"]))
            #scanning4type(cursor, "IMGRANK", "http://www.qiushibaike.com/imgrank", "".join(["/page/", "currentPageId", "?s=4602057"]))
            #scanning4type(cursor, "LATE", "http://www.qiushibaike.com/late", "".join(["/page/", "currentPageId", "?s=4602057"]))

            conn.commit()
        finally:
            cursor.close()
    finally:
        # Close the database
        conn.close()


# Start the crawler. There is no __main__ guard here, so this also runs when
# the module is imported.
main()



遍历新浪一些博客的图片,加入了访问频率控制
#!/usr/bin/python
# encoding=gbk


#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=

import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time

maxPage = 100 # highest index-page number that is scanned
# NOTE(review): presumably meant to raise the connection retry count.
# HTTPAdapter may bind its default when the class is defined, in which case
# this assignment has no effect -- confirm against the installed requests.
requests.adapters.DEFAULT_RETRIES = 5

# Request-rate throttling
DEFAULT_OPEN_PAGE_FREQUENCY = 1  # pause between pages, in seconds
DEFAULT_OPEN_IMAGE_FREQUENCY = 3  # pause, in seconds, taken after each batch of images
DEFAULT_IMAGE_COUNT = 0  # running count of images processed
DEFAULT_IMAGE_SIZE = 20  # batch size: after this many images, sleep DEFAULT_OPEN_IMAGE_FREQUENCY seconds


def saveImage(title, imageSrc): # 保存图片
    if title == None:
        title = u"无题"
    print u"标题:%s     图片:%s" % (title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"

    if not os.path.exists(dirStr):
        os.makedirs(dirStr)

    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks                                        5
                file.write(chunk)
                file.flush()
        file.close()


def listPicPage(pageUrl): #从首页打开链接,然后进行图片的页面
    global DEFAULT_IMAGE_COUNT

    request = requests.get(pageUrl)
    dom = soupparser.fromstring(request.content)
    body = dom[1]
    title = body.xpath("//h3[@class='title']")
    titleStr = "";
    if len(title) > 0:
        titleStr = title[0].text
    imageList = body.xpath("//div[@class='imgArea']/img[@class='qImg']")
    print u"遍历图片页面,  标题:%s,   地址: %s " % (titleStr, pageUrl)

    imageSrc = None
    for image in imageList:
        # 这里好像有两个地址,先用real_src,否在用src
        if image.get("real_src") != None:
            imageSrc = image.get("real_src")
        else:
            imageSrc = image.get("src")
            #要存在图片地址,才需要继续解析
        if imageSrc != None:
            saveImage(titleStr, imageSrc)

        #访问频率控制
        DEFAULT_IMAGE_COUNT = DEFAULT_IMAGE_COUNT + 1
        if DEFAULT_IMAGE_COUNT % DEFAULT_IMAGE_SIZE == 0:
            print u"图片计数:%s, 休息 %s 秒钟后继续\n" % (DEFAULT_IMAGE_COUNT, DEFAULT_OPEN_IMAGE_FREQUENCY)
            time.sleep(DEFAULT_OPEN_IMAGE_FREQUENCY)


def listPicIndex(): #遍历首页
    # 根据页数来打开url
    for i in range(1, maxPage + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        request = requests.get(url)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            #找到这一页的所有图片链接,然后进行打开这个链接,才是显示图片的页面
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                #遍历图片的页面
                listPicPage(link[0].get("href"))
            print u"---------------------------------------------完成一个图片链接, 页数:", i

            #访问频率控制
            # time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
    print u"---------------------------------------------完成页数", maxPage, ":", i


def main():
    """Entry point: crawl all index pages via listPicIndex()."""
    listPicIndex()
    # Single-page debugging call, kept for reference:
    #listPicPage("http://qing.blog.sina.com.cn/tj/a1509eee330044am.html")


# Only start crawling when executed as a script, not when imported.
if __name__ == "__main__":
    main()



上面的例子改成多线程
#!/usr/bin/python
# encoding=gbk
#http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time
import threading

MAX_PAGE = 100 # highest index-page number that is scanned
MAX_ERROR = 10 # error budget: while the error count is below this, a failed album page is retried
PAGE_SIZE = 5 # step, in pages, between the start pages of consecutive threads
DEFAULT_OPEN_PAGE_FREQUENCY = 2 # seconds slept after each index-list entry has been processed
DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY = 5 # seconds to wait after an error before retrying
# NOTE(review): presumably meant to raise the connection retry count; confirm
# that assigning this after import has any effect in the installed requests.
requests.adapters.DEFAULT_RETRIES = 5


def saveImage(thName, title, imageSrc, currentPath): # 保存图片
    if title == None:
        title = u"无题"
    print u"线程名称:%s,  页码:%s,   标题:%s     图片:%s" % (thName, currentPath, title, imageSrc)
    dirStr = u"/mnt/E/新浪图集/" + title + "/"

    if not os.path.exists(dirStr):
        os.makedirs(dirStr)

    fileName = imageSrc.split('/')[-1]
    request = requests.get(imageSrc, stream=True)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks                                        5
                file.write(chunk)
                file.flush()
        file.close()


def listPicPage(thName, pageUrl, currentPath):
    """Open one album page and save every image found on it.

    :param thName: name of the calling thread, passed on for logging.
    :param pageUrl: URL of the album page.
    :param currentPath: index page number being processed, passed on for
        logging.
    """
    response = requests.get(pageUrl)
    root = soupparser.fromstring(response.content)
    body = root[1]

    headings = body.xpath("//h3[@class='title']")
    titleStr = headings[0].text if len(headings) > 0 else ""
    images = body.xpath("//div[@class='imgArea']/img[@class='qImg']")

    for img in images:
        # Two attributes may hold the address: prefer real_src, else src.
        imageSrc = img.get("real_src")
        if imageSrc is None:
            imageSrc = img.get("src")
        # Only download when an address was found.
        if imageSrc is not None:
            saveImage(thName, titleStr, imageSrc, currentPath)


def listPicIndex(thName, startPath, endPath): #遍历首页
    # 根据页数来打开url
    for i in range(startPath, endPath + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        print url
        request = requests.get(url)
        json_obj = json.loads(request.content)

        error_count = 0
        for item in json_obj["data"]["list"]:
            #找到这一页的所有图片链接,然后进行打开这个链接,才是显示图片的页面
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                #遍历图片的页面
                try:
                    listPicPage(thName, link[0].get("href"), i)
                except:
                    if error_count < MAX_ERROR:
                        error_count = error_count + 1
                        #错先错误的话,等待一会儿,再重试
                        print u"---------------------------------------------休眠%s秒钟后重试, 页数:%s" % (
                            DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY, i)
                        time.sleep(DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY)
                        listPicPage(thName, link[0].get("href"), i)
                    else:
                        print u"出错超过预设次数,退出爬虫。"

            #print u"---------------------------------------------完成一个图片链接, 页数:", i

            #访问频率控制
            time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
    print u"---------------------------------------------完成页数", MAX_PAGE, ":", i
    return True


class MyThread(threading.Thread):
    """Worker thread that crawls one segment of index pages.

    Attributes set by the constructor:
      name      -- thread name, also used in log lines
      is_stop   -- stop flag; run() keeps working while it is false
      startPage -- first index page of the segment
      endPage   -- last index page of the segment
    """

    def __init__(self, name, startPath, endPage):
        super(MyThread, self).__init__()
        self.name = name
        self.is_stop = False
        self.startPage = startPath
        self.endPage = endPage

    def run(self):
        """Crawl the segment until the stop flag is set."""
        while True:
            if self.is_stop:
                return
            # listPicIndex's return value becomes the new stop flag, so the
            # loop ends once the segment has been processed.
            self.is_stop = listPicIndex(self.name, self.startPage, self.endPage)

    def stop(self):
        """Set the stop flag by hand."""
        self.is_stop = True


if __name__ == "__main__":
    #分段创建线程
    count=1;
    for i in range(1, MAX_PAGE, PAGE_SIZE):
        startPath = i
        endPath = i + PAGE_SIZE
        if endPath > MAX_PAGE:
            endPath = MAX_PAGE
        print startPath, ",", endPath

        t = MyThread("Thread " + str(count), startPath, endPath)
        count=count+1
        t.start()
        pass
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics