python 解析url

canofy

浏览: 837021 次
性别:
来自: 北京、四川

最近访客更多访客>>

hanjiguang

twinssnk

room_bb

chencc_123

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

python

Python JavaScript HTML Blog UP

以下例子摘录自《Dive Into Python》。
解析 HTML 有两种方法：HTMLParser 和 SGMLParser。
第一种（HTMLParser）：

#-*-coding:utf-8-*-
import  HTMLParser
#html解析，继承HTMLParser类
class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that prints the ``href`` value of every ``<a>`` tag.

    Each printed value is also appended to ``self.links`` so the results can
    be used programmatically after ``feed()``.  All other handlers are
    intentionally no-ops and exist as extension points.
    """

    def __init__(self):
        # The original spelled this ``_init``, so it was never run as the
        # constructor; using the real dunder name makes the setup effective.
        HTMLParser.HTMLParser.__init__(self)
        # href values in the order they were encountered.
        self.links = []

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag (``<tag .../>``) as start tag + end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        """Print and record the ``href`` attribute of each ``<a>`` start tag.

        ``attrs`` is a list of ``(name, value)`` pairs.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                self.links.append(value)
                # The single-argument call form prints the same text whether
                # ``print`` is a statement or a function.
                print(value)

    def handle_endtag(self, tag):
        """Handle an end tag such as ``</xx>``; nothing to do."""

    def handle_charref(self, name):
        """Handle a character reference (``&#...;``); nothing to do."""

    def handle_entityref(self, name):
        """Handle an entity reference such as ``&nbsp;``; nothing to do."""

    def handle_data(self, data):
        """Handle the text between tags (``<xx>data</xx>``); nothing to do."""

    def handle_comment(self, data):
        """Handle an HTML comment; nothing to do."""

    def handle_decl(self, decl):
        """Handle a ``<!...>`` declaration such as a DOCTYPE; nothing to do."""

    def handle_pi(self, data):
        """Handle a processing instruction (``<?instruction>``); nothing to do."""
    
    

# Demo: run a small HTML snippet through the parser.
a = '<body><a href="www.163.com">test</a></body>'
print(a)
my = MyHTMLParser()
my.feed(a)
# close() forces processing of any input that feed() left buffered.
my.close()
# Output: the HTML snippet itself, followed by www.163.com

第二种方式（SGMLParser）：
首先定义一个基础类 BaseHTMLProcessor（保存为 BaseHTMLProcessor.py，后面的例子会导入它）。与第一种方式类似，它通过重写各个处理方法来处理解析到的内容：

#!/usr/bin/env python
#-*-coding:utf-8-*-
from sgmllib import SGMLParser
import htmlentitydefs

class BaseHTMLProcessor(SGMLParser):
    """SGMLParser subclass that reconstructs the HTML it is fed.

    Every handler appends a textual reconstruction of the item it received
    to ``self.pieces``; ``output()`` joins the pieces into one string.
    Subclasses override individual handlers to transform the document.
    """

    def reset(self):
        """Clear the collected pieces, then reset the underlying parser.

        Extends ``SGMLParser.reset`` (called by ``SGMLParser.__init__``).
        """
        self.pieces = []
        SGMLParser.reset(self)

    def unknown_starttag(self, tag, attrs):
        """Rebuild a start tag that has no ``start_<tag>``/``do_<tag>`` method.

        A start tag opens a block (``<html>``, ``<head>``, ``<body>``,
        ``<pre>``, ...) or stands alone (``<br>``, ``<img>``, ...).  For a tag
        ``tagname`` SGMLParser looks for ``start_tagname`` or ``do_tagname``
        and calls it with the attribute list; otherwise it calls this method
        with the tag name and the attribute list.

        ``attrs`` is a list of ``(attr, value)`` tuples, e.g. for
        ``<pre class="screen">``: tag="pre", attrs=[("class", "screen")].

        The original tag cannot be reproduced exactly: values that were
        unquoted in the source get quoted, and single quotes become double
        quotes.  Improperly embedded non-HTML code (such as client-side
        JavaScript) may be parsed incorrectly by the ancestor and cause
        runtime script errors; such code must be enclosed in HTML comments
        (``<!-- code -->``) so that it passes through unaltered via
        ``handle_comment``.
        """
        # Values are always emitted inside double quotes, so a literal double
        # quote in a value must be escaped or it would end the attribute early.
        strattrs = "".join(
            ' %s="%s"' % (key, value.replace('"', "&quot;"))
            for key, value in attrs
        )
        self.pieces.append("<%s%s>" % (tag, strattrs))

    def unknown_endtag(self, tag):
        """Rebuild an end tag that has no ``end_<tag>`` method.

        An end tag closes a block (``</html>``, ``</pre>``, ...).  SGMLParser
        looks for ``end_tagname``; if there is none it calls this method with
        the tag name, e.g. "pre" for ``</pre>``.
        """
        self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        """Rebuild a numeric character reference.

        For ``&#160;`` the parser passes ref="160".
        """
        self.pieces.append("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Rebuild a named entity reference.

        For ``&copy;`` the parser passes ref="copy".
        """
        self.pieces.append("&%s" % ref)
        # Standard HTML entities are closed with a semicolon; others are not.
        # ``in`` replaces the deprecated ``dict.has_key``.
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    def handle_data(self, text):
        """Store a block of plain text verbatim.

        Called for text outside of any tag that contains no character or
        entity references.
        """
        self.pieces.append(text)

    def handle_comment(self, text):
        """Rebuild an HTML comment (``<!-- ... -->``) from its content.

        Source documents should enclose client-side code (such as JavaScript)
        in comments so that it passes through this processor undisturbed.
        """
        self.pieces.append("<!--%s-->" % text)

    def handle_pi(self, text):
        """Rebuild a processing instruction (``<? ... >``) from its content."""
        self.pieces.append("<?%s>" % text)

    def handle_decl(self, text):
        """Rebuild a declaration (``<! ... >``), such as a DOCTYPE.

        Example source:
        ``<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">``
        """
        self.pieces.append("<!%s>" % text)

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)

接着是第二种方法的具体应用：解析新浪博客上一篇特定文章的标题和内容，代码如下：

#!/usr/bin/env python
#coding:utf8
import re
from BaseHTMLProcessor import BaseHTMLProcessor
import urllib

class Dialectizer(BaseHTMLProcessor):
    """Reduce a page to its text plus a few plain-text markers.

    Unknown tags are dropped.  ``<title>`` and ``</title>`` each become the
    marker "title", ``<p>`` becomes a newline, ``<div ...>`` becomes the
    concatenation of its attribute values and ``</div>`` becomes the marker
    "div".  Text is passed through ``process()``, which applies ``subs``.
    """

    # Sequence of (pattern, replacement) pairs applied to text by process().
    subs = ()

    def reset(self):
        """Reset all data attributes; extends the ancestor's reset.

        Called from ``__init__`` in the ancestor.
        """
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    def unknown_starttag(self, tag, attrs):
        """Drop any start tag without a dedicated handler."""
        self.pieces.append("")

    def unknown_endtag(self, tag):
        """Drop any end tag without a dedicated handler."""
        self.pieces.append("")

    def start_title(self, attrs):
        """Replace ``<title>`` with the marker "title"."""
        self.pieces.append("title")

    def end_title(self):
        """Replace ``</title>`` with the marker "title"."""
        self.pieces.append("title")

    def start_p(self, attrs):
        """Replace ``<p>`` with a newline."""
        self.pieces.append("\n")

    def end_p(self):
        """Drop ``</p>``."""
        self.pieces.append("")

    def start_div(self, attrs):
        """Replace ``<div ...>`` with its attribute values, concatenated."""
        strattrs = "".join([value for key, value in attrs])
        self.pieces.append(strattrs)

    def end_div(self):
        """Replace ``</div>`` with the marker "div"."""
        self.pieces.append("div")

    def handle_data(self, text):
        """Store text as-is in verbatim mode, otherwise after ``process()``.

        An explicit branch replaces ``verbatim and text or process(text)``,
        which fell through to ``process()`` for empty text even in verbatim
        mode.
        """
        if self.verbatim:
            self.pieces.append(text)
        else:
            self.pieces.append(self.process(text))

    def process(self, text):
        """Apply every (pattern, replacement) pair in ``subs`` to *text*."""
        for fromPattern, toPattern in self.subs:
            text = re.sub(fromPattern, toPattern, text)
        return text


def translate(url):
    """Fetch *url* and return its content as processed by ``Dialectizer``.

    The page is downloaded with ``urllib.urlopen``, fed through a fresh
    ``Dialectizer`` and the parser's ``output()`` string is returned.
    """
    import urllib
    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        # Release the connection even if reading fails part-way.
        sock.close()
    parser = Dialectizer()
    # To rewrite text while parsing, assign (pattern, replacement) pairs to
    # parser.subs before feeding, e.g. ((r"&#26412;", r"aaa"),).
    parser.feed(htmlSource)  # run the parse
    parser.close()
    return parser.output()

def test(url, filename):
    """Download a blog post and write its title and body text to *filename*.

    The page is first reduced by ``translate()``.  The title is the text
    between the first two "title" markers; the body is the text that follows
    the "articleBody" marker up to the next "div" marker.

    Raises ``AttributeError`` when an expected marker is missing, because
    ``re.search`` then returns ``None``.
    """
    htmlSource = translate(url)
    # Title: everything after the first "title" marker, up to the next one.
    title = htmlSource[re.search("title", htmlSource).end():]
    title = title[:re.search("title", title).start()]
    # Body: starts 2 characters past "articleBody", ends before the next "div".
    # NOTE(review): the reason for skipping 2 extra characters is not evident
    # from this code - confirm against the page markup.
    content = htmlSource[re.search("articleBody", htmlSource).end() + 2:]
    content = content[:re.search("div", content).start()]
    # Remove non-breaking-space entities, including leftovers that have
    # already lost their leading "&".
    content = content.replace("&nbsp;", "").replace("nbsp;", "")
    # File content to write out.
    fileContent = title + "\n\n\n" + content
    fsock = open(filename, "wb")
    try:
        fsock.write(fileContent)
    finally:
        # Close the file even if the write fails.
        fsock.close()

if __name__ == "__main__":
    import sys
    # Optional command-line overrides: [url [output_file]].  Arguments that
    # are not given fall back to the original sample post and output file.
    defaults = ["http://blog.sina.com.cn/s/blog_4bd7b9a20100cpgb.html", 'test.txt']
    given = sys.argv[1:3]
    test(*(given + defaults[len(given):]))

代码.zip (3.9 KB)
下载次数: 32

分享到：

python备份文件 | 关于shell函数的总结

2009-03-20 18:13
浏览 5155
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python 解析url

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

python 解析url

评论

发表评论

相关推荐

GAE发布

django简单的入门例子

摘自python cookbook2(文本文件)

摘自python cookbook1(字符串，字典)

flup安装问题

xml的解析例子

python备份文件

使用python下载日志

python控制流

python类型转换、数值操作

python学习之类型和对象

Django 的数据库查询

摘自python的文档

python发送email

python的简单文件操作

python的数据库链接

用python实现的时间函数

最近访客更多访客>>