#!/usr/bin/env python
__author__ = 'chenyang'

"""Web Crawler/Spider

This module implements a web crawler. This is very _basic_ only
and needs to be extended to do anything useful with the
traversed pages.
"""

import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty

from BeautifulSoup import BeautifulSoup

__version__ = "0.2"
__copyright__ = "CopyRight (C) 2008-2011 by James Mills"
__license__ = "MIT"
__author__ = "James Mills"
__author_email__ = "James Mills, James dot Mills st dotred dot com dot au"

USAGE = "%prog [options] <url>"
VERSION = "%prog v" + __version__
AGENT = "%s/%s" % (__name__, __version__)


class Crawler(object):

    def __init__(self, root, depth, locked=True):
        self.root = root
        self.depth = depth
        self.locked = locked
        self.host = urlparse.urlparse(root)[1]
        self.urls = []
        self.links = 0
        self.followed = 0

    def crawl(self):
        # Seed the queue with the links found on the root page.
        page = Fetcher(self.root)
        page.fetch()
        q = Queue()
        for url in page.urls:
            q.put(url)
        followed = [self.root]

        n = 0

        while True:
            try:
                # Non-blocking get: a blocking q.get() would hang forever once
                # the frontier is exhausted, since nothing else fills the queue.
                url = q.get(block=False)
            except QueueEmpty:
                break

            n += 1

            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    # When locked (the default), only follow URLs on the same host.
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        self.followed += 1
                        page = Fetcher(url)
                        page.fetch()
                        for i, url in enumerate(page):
                            if url not in self.urls:
                                self.links += 1
                                q.put(url)
                                self.urls.append(url)
                        # Note: n counts URLs taken off the queue, so "depth"
                        # effectively caps the number of pages visited rather
                        # than the link depth.
                        if n > self.depth and self.depth > 0:
                            break
                except Exception, e:
                    print "ERROR: Can't process url '%s' (%s)" % (url, e)
                    print format_exc()


class Fetcher(object):

    def __init__(self, url):
        self.url = url
        self.urls = []

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def fetch(self):
        opened = self.open()
        if opened is None:
            return
        request, handle = opened
        self._addHeaders(request)
        try:
            content = unicode(handle.open(request).read(), "utf-8",
                              errors="replace")
            soup = BeautifulSoup(content)
            # BeautifulSoup: soup('a') returns every <a> tag on the page.
            tags = soup('a')
        except urllib2.HTTPError, error:
            if error.code == 404:
                print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
            else:
                print >> sys.stderr, "ERROR: %s" % error
            tags = []
        except urllib2.URLError, error:
            print >> sys.stderr, "ERROR: %s" % error
            tags = []
        for tag in tags:
            href = tag.get("href")
            if href is not None:
                url = urlparse.urljoin(self.url, escape(href))
                if url not in self:
                    self.urls.append(url)


def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)


def parse_options():
    """parse_options() -> opts, args

    Parse any command-line options given returning both
    the parsed options and arguments.
    """
    parser = optparse.OptionParser(usage=USAGE, version=VERSION)

    parser.add_option("-q", "--quiet",
                      action="store_true", default=False, dest="quiet",
                      help="Enable quiet mode")

    parser.add_option("-l", "--links",
                      action="store_true", default=False, dest="links",
                      help="Get links for specified url only")

    parser.add_option("-d", "--depth",
                      action="store", type="int", default=30, dest="depth",
                      help="Maximum depth to traverse")

    opts, args = parser.parse_args()

    if len(args) < 1:
        parser.print_help()
        raise SystemExit, 1

    return opts, args


def main():
    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth = opts.depth

    sTime = time.time()

    print "Crawling %s (Max Depth: %d)" % (url, depth)
    crawler = Crawler(url, depth)
    crawler.crawl()
    print "\n".join(crawler.urls)

    eTime = time.time()
    tTime = eTime - sTime

    print "Found: %d" % crawler.links
    print "Followed: %d" % crawler.followed
    print "Stats: (%d/s after %0.2fs)" % (
        int(math.ceil(float(crawler.links) / tTime)), tTime)


if __name__ == "__main__":
    main()
Python BeautifulSoup quick notes (in Chinese):
http://rsj217.diandian.com/post/2012-11-01/40041235132
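The Fetcher.fetch() method above leans on two BeautifulSoup idioms: calling the soup object (soup('a'), shorthand for soup.findAll('a')) and reading attributes with tag.get(). A minimal standalone sketch of that pattern, using BeautifulSoup 3 as imported in the crawler and a made-up HTML string:

from BeautifulSoup import BeautifulSoup

html = '<html><body><a href="/a">A</a> <a name="x">no href</a></body></html>'
soup = BeautifulSoup(html)

for tag in soup('a'):          # every <a> tag in the document
    print tag.get("href")      # '/a', then None for the tag without an href

tag.get() returning None for a missing attribute is what lets fetch() skip anchors that have no href.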
Tk calculator
import Tkinter as tk

calc = tk.Tk()
calc.title("CrappyCalc")

buttons = [
    '7', '8', '9', '*', 'C',
    '4', '5', '6', '/', 'Neg',
    '1', '2', '3', '-', '$',
    '0', '.', '=', '+', '@'
]

# set up GUI: a 5-wide grid of buttons below the display row
row = 1
col = 0

for i in buttons:
    button_style = 'raised'
    # The default argument captures the current label; see the note after the code.
    action = lambda x=i: click_event(x)
    tk.Button(calc, text=i, width=4, height=3, relief=button_style,
              command=action).grid(row=row, column=col, sticky='nesw')
    col += 1
    if col > 4:
        col = 0
        row += 1

display = tk.Entry(calc, width=40, bg="white")
display.grid(row=0, column=0, columnspan=5)


def click_event(key):
    # = -> evaluate the expression currently in the display
    if key == '=':
        # crude safeguard against integer division: force a float result
        if '/' in display.get() and '.' not in display.get():
            display.insert(tk.END, ".0")
        try:
            result = eval(display.get())
            display.insert(tk.END, " = " + str(result))
        except:
            display.insert(tk.END, " Error, use only valid chars")
    # C -> clear display
    elif key == 'C':
        display.delete(0, tk.END)
    # $ -> clear display and show an easter egg
    elif key == '$':
        display.delete(0, tk.END)
        display.insert(tk.END, "$$$$C.$R.$E.$A.$M.$$$$")
    # @ -> clear display and show an easter egg
    elif key == '@':
        display.delete(0, tk.END)
        display.insert(tk.END, "wwwwwwwwwwwwwwwwebsite")
    # Neg -> negate the current term (the button label is 'Neg', so match that)
    elif key == 'Neg':
        if '=' in display.get():
            display.delete(0, tk.END)
        try:
            if display.get()[0] == '-':
                display.delete(0)
            else:
                display.insert(0, '-')
        except IndexError:
            pass
    # any other key: start fresh after a result, then append the key
    else:
        if '=' in display.get():
            display.delete(0, tk.END)
        display.insert(tk.END, key)

# RUNTIME
calc.mainloop()
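One detail worth calling out in the button loop is action = lambda x=i: click_event(x). The default argument freezes the current value of i for each button; a plain lambda: click_event(i) would make every button send the last label in the list, because i would only be looked up when the button is clicked. A tiny standalone sketch of the difference (the label list here is just an illustration, not part of the calculator):

labels = ['7', '8', '9']
late_bound = [lambda: k for k in labels]        # every callback sees the final k
early_bound = [lambda k=k: k for k in labels]   # each callback captures its own label

print [f() for f in late_bound]    # ['9', '9', '9']
print [f() for f in early_bound]   # ['7', '8', '9']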