python 爬某高校C++题库小程序

bosshida

浏览: 214409 次
性别:
来自: 广州

最近访客更多访客>>

PowerNTT

cigogo

TangoHuang

屌丝码农

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

编程

最近在学习C++，爬下一些题来，用于平时练手。

原理其实也很简单，就是生成页面链接，下载网页，提取内容，保存为xml。

用到正则表达式来匹配，minidom来生成xml文件。

（相关URL已更改，题库将上传到sae的网页）

#coding:utf8
# c++ exercises crawler by bosshida,2014.1.2
import urllib2
import re
import string
from xml.dom import minidom

# Captures the inner text of an <h2 ...>...</h2> element (the problem title).
pNamePattern = re.compile("<h2.*?>(.*?)</h2>", re.DOTALL)
# Captures the text between a background="srcs/bg_mid.gif" attribute's tag
# end and the next </td> (the problem description cell).
pDescPattern = re.compile(
    '<td.*?background="srcs/bg_mid.gif".*?>(.*?)</td>',
    re.DOTALL,
)
# Captures the body of each <pre ...>...</pre> element; the crawler reads the
# first four as input, output, sample input and sample output.
pPrePattern = re.compile("<pre.*?>(.*?)</pre>", re.DOTALL)

def genProblemSetUrl(count):
    """Return the URL of the problem-set listing page for volume ``count``."""
    template = "http://xxx.edu.cn:8080/JudgeOnline/problemset.jsp?vol=%d"
    return template % count

def getAllProblemUrl(page):
    """Return every relative problem link found in a problem-set page.

    ``page`` is the HTML text of a listing page.  The result is a list of
    strings of the form ``problem.jsp?id=<digits>``, in document order, taken
    from anchors written exactly as ``<a href="problem.jsp?id=N">``.
    """
    # Raw string so that \. \? and \d reach the regex engine untouched; the
    # dot is escaped so only the literal file name "problem.jsp" matches.
    return re.findall(r'<a href="(problem\.jsp\?id=\d+)">', page)

def loadPage(url):
    """Download ``url`` and return the raw response body.

    Errors raised by ``urllib2.urlopen`` or by reading the response are
    propagated to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails part-way through.
        response.close()

def getProblemName(page):
    """Return the first ``pNamePattern`` capture in ``page``, or None if absent."""
    found = pNamePattern.search(page)
    if found is None:
        return None
    return found.group(1)

def getProblemDesc(page):
    """Return the first ``pDescPattern`` capture in ``page``, or None if absent."""
    found = pDescPattern.search(page)
    if found is None:
        return None
    return found.group(1)

def getInputContent(page):
    """Return the body of the first <pre> block in ``page`` (the input spec).

    Raises IndexError when ``page`` contains no <pre> block.
    """
    return pPrePattern.findall(page)[0]

def getOutputContent(page):
    """Return the body of the second <pre> block in ``page`` (the output spec).

    Raises IndexError when ``page`` contains fewer than two <pre> blocks.
    """
    return pPrePattern.findall(page)[1]

def getSampleInput(page):
    """Return the body of the third <pre> block in ``page`` (the sample input).

    Raises IndexError when ``page`` contains fewer than three <pre> blocks.
    """
    return pPrePattern.findall(page)[2]

def getSampleOutput(page):
    """Return the body of the fourth <pre> block in ``page`` (the sample output).

    Raises IndexError when ``page`` contains fewer than four <pre> blocks.
    """
    return pPrePattern.findall(page)[3]

class XmlGenerator:
    """Small helper that builds a ``minidom`` document and saves it to a file.

    ``doc`` is the underlying ``minidom.Document``; ``xmlName`` is the path
    that ``genXml`` writes to.
    """

    def __init__(self, xmlName):
        """Start an empty document that will be saved at path ``xmlName``."""
        self.doc = minidom.Document()
        self.xmlName = xmlName

    def createNode(self, nodeName):
        """Return a new, unattached element named ``nodeName``."""
        return self.doc.createElement(nodeName)

    def addNode(self, node, prevNode=None):
        """Append ``node`` under ``prevNode`` and return ``node``.

        When ``prevNode`` is None the node is appended to the document
        itself, i.e. it becomes the root element.
        """
        parent = self.doc if prevNode is None else prevNode
        parent.appendChild(node)
        return node

    def setNodeAttr(self, node, attName, value):
        """Set attribute ``attName`` of ``node`` to ``value``."""
        # The original referenced a misspelled local here and always raised
        # NameError; operate on the node that was passed in.
        node.setAttribute(attName, value)

    def setNodeValue(self, curNode, value):
        """Append a text node holding ``value`` to ``curNode``."""
        curNode.appendChild(self.doc.createTextNode(value))

    def genXml(self):
        """Serialize the document as pretty-printed utf8 XML to ``xmlName``."""
        # Serialize before opening the file so a serialization error does not
        # leave a truncated file behind.
        xmlBytes = self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf8")
        # toprettyxml(encoding=...) yields encoded bytes, so write in binary
        # mode; ``with`` closes the handle even if the write fails.
        with open(self.xmlName, "wb") as f:
            f.write(xmlBytes)

# Crawl driver: walk the problem-set volumes, scrape every linked problem
# page and append it as a <problem> element under <root> in the output file.
path = "D://test.xml"
xmlGen = XmlGenerator(path)
rootNode = xmlGen.createNode("root")
xmlGen.addNode(node=rootNode)

# Prefix that turns the relative problem links into absolute URLs.
baseUrl = "http://xxx.edu.cn:8080/JudgeOnline/"

# Volumes 1 through 6 of the problem set are crawled.
for count in range(1, 7):
    pSetUrl = genProblemSetUrl(count)
    print(pSetUrl)
    html = loadPage(pSetUrl)
    for pageUrl in getAllProblemUrl(html):
        pageUrl = baseUrl + pageUrl
        print(pageUrl)
        # Problem pages are decoded as GBK before the regexes run on them.
        pPage_gbk = loadPage(pageUrl).decode("gbk")

        # (element name, text) pairs in the order they appear in the output.
        fields = [
            ("url", pageUrl),
            ("name", getProblemName(pPage_gbk)),
            ("desc", getProblemDesc(pPage_gbk)),
            ("input", getInputContent(pPage_gbk)),
            ("output", getOutputContent(pPage_gbk)),
            ("sampleInput", getSampleInput(pPage_gbk)),
            ("sampleOutput", getSampleOutput(pPage_gbk)),
        ]

        problemNode = xmlGen.createNode("problem")
        for tag, text in fields:
            fieldNode = xmlGen.createNode(tag)
            # getProblemName/getProblemDesc return None when their pattern
            # does not match; minidom's createTextNode raises TypeError for
            # non-string data, so store an empty string instead.
            xmlGen.setNodeValue(fieldNode, u"" if text is None else text)
            xmlGen.addNode(fieldNode, problemNode)

        xmlGen.addNode(problemNode, rootNode)
        # The file is rewritten after every problem, as in the original;
        # presumably so a crash mid-crawl keeps what was scraped so far.
        xmlGen.genXml()
print("finish")

分享到：

python向mysql写入时出现中文乱码 | （转）vim设置

2014-01-18 15:44
浏览 1387
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python 爬某高校C++题库小程序

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python 爬某高校C++题库小程序

评论

发表评论

相关推荐

w3school html 学习笔记

phpcms 笔记

dive into python 笔记

centos下饭强--obfucated-openssh sshcenter.info

android的【qq通讯录】导出短信，在iphone上恢复

python的一些记录

《程序设计导引及在线实践》学习

accelerated c++ 学习笔记

关于foreach与普通for的区别

学习Linux命令，读《系统程序员成长计划》

base64，日期操作，jexl读取excel

无题。。好代码记录

数据结构：排序

c语言复习笔记

并发学习笔记（更新中，java编程思想第四版21章）

并发学习笔记（更新中，java编程思想第四版21章）

RMI 入门

《大话设计模式》一书的所有代码和UML类图

windows系统下的进程监测程序--实现过程记录

发短信算法题

最近访客更多访客>>