`
david_je
  • 浏览: 368739 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

Go写的爬虫

    博客分类:
  • go
Go 
阅读更多
    今天整理资料时,惊奇地发现了去年7月份参加GDG的“七周七语言”活动时写的一个Go语言程序。程序很简单,主要是访问一个博客网站,下载并保存其中的网页,具体要求请见 http://be001.com/jams/110

    先上程序:
   
    package main

import (
 "fmt"
 "os"
 "log"
 "net/http"
 "io/ioutil"
 "bytes"
 "encoding/base64"
 "path"
 "strings"
 "regexp"
)

// Crawler configuration.
const (
   // BASEURL is the page the crawl starts from; traverse also uses it to
   // decide which links to follow.
   BASEURL = "http://blog.csdn.net/xushiweizh"
   // PATTERN is the regular expression source compiled in main. It matches
   // href="..." attributes and captures the attribute value in group 1.
   PATTERN = "href=\"([^<\"]+)\""
   // INDEX is the length of BASEURL; save slices a URL at this offset to
   // obtain the text the on-disk file name is derived from.
   INDEX = len(BASEURL)
   // ROOT is the directory that saveFile creates the downloaded files in.
   ROOT = "C:/docs/GoCode/disk/"
)

// getContent issues an HTTP GET for url and returns the response body as a
// string.
//
// The HTTP status code is not inspected, so error pages are returned like any
// other body. If the request cannot be made, or the body cannot be read in
// full, the program is terminated via log.Fatal.
func getContent(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}

	body, err := ioutil.ReadAll(resp.Body)
	// Close explicitly rather than with defer: log.Fatal exits the process
	// without running deferred calls.
	resp.Body.Close()
	if err != nil {
		// A failed read would otherwise be returned as a silently truncated page.
		log.Fatal(err)
	}
	return string(body)
}

// base64Encoding returns the standard (RFC 4648, padded) base64 encoding of s.
// An empty input yields an empty string.
func base64Encoding(s string) string {
	var buf bytes.Buffer
	encoder := base64.NewEncoder(base64.StdEncoding, &buf)
	// Errors are ignored on purpose: the encoder only reports errors from the
	// underlying writer, and bytes.Buffer writes never return one.
	encoder.Write([]byte(s))
	// Close must run before the buffer is read. The encoder holds back any
	// trailing partial 3-byte group until Close, so deferring it would return
	// a result that is missing the final characters and the padding.
	encoder.Close()
	return buf.String()
}

// saveFile writes content, followed by a trailing "\r\n", to a file inside
// ROOT whose name is the base64 encoding of filename.
//
// It returns nil without touching the disk when the encoded name is empty.
// Otherwise it returns the first error encountered while creating, writing
// or closing the file; the caller decides how to react.
func saveFile(content, filename string) (err error) {
	base64Name := base64Encoding(filename)
	if base64Name == "" {
		return nil
	}
	// The standard base64 alphabet includes '/', which path.Join would treat
	// as a directory separator. Map the name onto the URL-safe alphabet so it
	// is always a single path element.
	base64Name = strings.NewReplacer("+", "-", "/", "_").Replace(base64Name)

	fout, err := os.Create(path.Join(ROOT, base64Name))
	if err != nil {
		return err
	}
	// Register Close only once the file is known to be open, and surface its
	// error: a failed close can mean the data never reached the disk.
	defer func() {
		if cerr := fout.Close(); err == nil {
			err = cerr
		}
	}()

	if _, err = fout.WriteString(content); err != nil {
		return err
	}
	_, err = fout.WriteString("\r\n")
	return err
}

// save fetches the page at url, stores it on disk under a name derived from
// the part of url that follows its first INDEX bytes, and then marks url as
// done in links. A storage error terminates the program via log.Fatal.
func save(url string, links map[string]bool) {
	page := getContent(url)
	if saveErr := saveFile(page, url[INDEX:]); saveErr != nil {
		log.Fatal(saveErr)
	}
	links[url] = true
}

// traverse fetches the page at url, extracts every link matched by regxp and
// downloads each linked page that belongs to the site rooted at BASEURL and is
// not yet marked as downloaded in links.
//
// regxp must capture the link target in its first submatch. A link is followed
// when it is an absolute "http://" URL starting with BASEURL, or a
// host-relative path starting with the last path segment of BASEURL; anything
// else is skipped. links is updated by save as pages are downloaded.
func traverse(regxp *regexp.Regexp, url string, links map[string]bool) {
	last := strings.LastIndex(BASEURL, "/")
	// host is BASEURL up to its last "/", sitePath is the remainder, so
	// host+sitePath == BASEURL.
	host, sitePath := BASEURL[:last], BASEURL[last:]

	// Scan the page that was asked for; always scanning BASEURL would make
	// every call after the first one a repeat of the same work.
	for _, scan := range regxp.FindAllStringSubmatch(getContent(url), -1) {
		if len(scan) < 2 {
			// regxp has no capture group, so there is no link target to read.
			continue
		}
		href := scan[1]

		var target string
		switch {
		case strings.HasPrefix(href, "http://"):
			// Prefix test, not substring: a foreign URL that merely embeds
			// BASEURL (e.g. in a query string) must not be followed.
			if !strings.HasPrefix(href, BASEURL) {
				continue
			}
			target = href
		case strings.HasPrefix(href, sitePath):
			target = host + href
		default:
			continue
		}

		if links[target] {
			fmt.Println("Has download")
			continue
		}
		save(target, links)
	}
}

// main crawls the site: it starts at BASEURL and then keeps traversing every
// page recorded in links until no recorded page is left untraversed.
func main() {
	links := make(map[string]bool)
	regxp, err := regexp.Compile(PATTERN)
	if err != nil {
		log.Fatal(err)
	}

	// crawled records the pages whose links have already been followed. It is
	// kept apart from links because links only ever holds true values (a page
	// is added once it has been saved), so testing those values can never
	// select a page that still needs traversing.
	crawled := map[string]bool{BASEURL: true}
	traverse(regxp, BASEURL, links)

	for {
		// Snapshot the outstanding pages first: traverse adds entries to
		// links, and entries added while ranging over a map may be skipped.
		var pending []string
		for key := range links {
			if !crawled[key] {
				pending = append(pending, key)
			}
		}
		if len(pending) == 0 {
			break
		}
		for _, key := range pending {
			crawled[key] = true
			traverse(regxp, key, links)
		}
	}
}


    



分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics