
I am writing a Go project, a simple web crawler that crawls the links on a website. I want to experiment with concurrency features such as goroutines and channels. But when I run it, it doesn't get anywhere: nothing is displayed, as if nothing had happened. I don't know what went wrong. Can someone point it out for me? I'd like to understand how to use channels correctly in the context of Go concurrency.

It works and displays all the crawled links if I remove the channel logic, but I want it to send the links to a buffered channel, then display the links, and then end the program. The program should be able to crawl to whatever depth is specified in the program; currently the depth is 1.

package main 

import (
    "fmt" 
    "log" 
    "net/http" 
    "os" 
    "strings" 
    "time" 

    "golang.org/x/net/html" 
) 

// Link type to be sent over channel 
type Link struct { 
    URL string 
    ok bool 
} 

func main() { 
    if len(os.Args) != 2 { 
     fmt.Println("Usage: crawl [URL].") 
    } 

    url := os.Args[1] 
    if !strings.HasPrefix(url, "http://") { 
     url = "http://" + url 
    } 

    ch := make(chan *Link, 5) 
    crawl(url, 1, ch) 

    visited := make(map[string]bool) 

    time.Sleep(2 * time.Second) 

    for link := range ch { 
     if _, ok := visited[link.URL]; !ok { 
      visited[link.URL] = true 
     } 
    } 

    close(ch) 
    for l := range visited { 
     fmt.Println(l) 
    } 
} 

func crawl(url string, n int, ch chan *Link) { 
    if n < 1 { 
     return 
    } 
    resp, err := http.Get(url) 
    if err != nil { 
     log.Fatalf("Can not reach the site. Error = %v\n", err) 
     os.Exit(1) 
    } 

    b := resp.Body 
    defer b.Close() 

    z := html.NewTokenizer(b) 

    nextN := n - 1 
    for { 
     token := z.Next() 

     switch token { 
     case html.ErrorToken: 
      return 
     case html.StartTagToken: 
      current := z.Token() 
      if current.Data != "a" { 
       continue 
      } 
      result, ok := getHrefTag(current) 
      if !ok { 
       continue 
      } 

      hasProto := strings.HasPrefix(result, "http") 
      if hasProto { 
       go crawl(result, nextN, ch) 
       ch <- &Link{result, true} 
      } 
     } 
    } 

} 

func getHrefTag(token html.Token) (result string, ok bool) { 
    for _, a := range token.Attr { 
     if a.Key == "href" { 
      result = a.Val 
      ok = true 
      break 
     } 
    } 
    return 
} 

Update:

After some fiddling I figured out how to change the code to remove the data race, but I still don't know how to avoid crawling URLs that have already been visited (maybe I should open a new question?):

package main 

import (
    "fmt" 
    "log" 
    "net/http" 
    "os" 
    "strings" 

    "golang.org/x/net/html" 
) 

func main() { 
    if len(os.Args) != 2 { 
     fmt.Println("Usage: crawl [URL].") 
    } 

    url := os.Args[1] 
    if !strings.HasPrefix(url, "http://") { 
     url = "http://" + url 
    } 

    for link := range newCrawl(url, 1) { 
     fmt.Println(link) 
    } 
} 

func newCrawl(url string, num int) chan string { 
    ch := make(chan string, 20) 

    go func() { 
     crawl(url, 1, ch) 
     close(ch) 
    }() 

    return ch 
} 

func crawl(url string, n int, ch chan string) { 
    if n < 1 { 
     return 
    } 
    resp, err := http.Get(url) 
    if err != nil { 
     log.Fatalf("Can not reach the site. Error = %v\n", err) 
     os.Exit(1) 
    } 

    b := resp.Body 
    defer b.Close() 

    z := html.NewTokenizer(b) 

    nextN := n - 1 
    for { 
     token := z.Next() 

     switch token { 
     case html.ErrorToken: 
      return 
     case html.StartTagToken: 
      current := z.Token() 
      if current.Data != "a" { 
       continue 
      } 
      result, ok := getHrefTag(current) 
      if !ok { 
       continue 
      } 

      hasProto := strings.HasPrefix(result, "http") 
      if hasProto { 
       done := make(chan struct{}) 
       go func() { 
        crawl(result, nextN, ch) 
        close(done) 
       }() 
       <-done 
       ch <- result 
      } 
     } 
    } 
} 

func getHrefTag(token html.Token) (result string, ok bool) { 
    for _, a := range token.Attr { 
     if a.Key == "href" { 
      result = a.Val 
      ok = true 
      break 
     } 
    } 
    return 
} 
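
Regarding the open question above about skipping URLs that have already been visited, one possible approach (a sketch under my own assumptions, not part of the original code) is a mutex-guarded set consulted before a link is followed; markVisited here is a hypothetical helper, not something from the post:

package main

import (
    "fmt"
    "sync"
)

var (
    mu      sync.Mutex
    visited = make(map[string]bool)
)

// markVisited reports whether url is being seen for the first time.
// In the crawler above, crawl could call it before following a link
// and skip the link when it returns false.
func markVisited(url string) bool {
    mu.Lock()
    defer mu.Unlock()
    if visited[url] {
        return false
    }
    visited[url] = true
    return true
}

func main() {
    fmt.Println(markVisited("http://example.com")) // true, first visit
    fmt.Println(markVisited("http://example.com")) // false, already seen
}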

The main goroutine sends to the channel (in the call to 'crawl') and only receives from the channel later. If more than 5 links are sent, the program will deadlock. –


@CeriseLimón Thanks. I tried changing it to 150, but it still deadlocks. Any suggestions? I don't think a normal website has more than 150 links on its home page. – newguy


Solve this by calling 'go crawl(url, 1, ch)' from main. The next problem is that 'main' blocks on 'ch'. Something needs to close 'ch' so it can go on and print the links. –
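
To illustrate what the comments describe, here is a minimal, self-contained sketch (not from the original post) of the send/receive pattern: the producer runs in its own goroutine and closes the channel when it is done, so the range loop in main can finish instead of deadlocking.

package main

import "fmt"

func main() {
    ch := make(chan int, 5)

    // Producing from a separate goroutine avoids filling the buffer while
    // main has not yet started receiving; closing ch lets the range loop
    // below terminate.
    go func() {
        for i := 0; i < 20; i++ {
            ch <- i
        }
        close(ch)
    }()

    for v := range ch {
        fmt.Println(v)
    }
}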

Answer


I don't think recursively spawning goroutines is a good idea. It can easily get out of control. I would prefer a flatter model, something like this:

package main 

import (
    "fmt" 
    "log" 
    "net/http" 
    "os" 
    "strings" 
    "sync" 

    "golang.org/x/net/html" 
) 

func main() { 

    if len(os.Args) != 2 { 
     fmt.Println("Usage: crawl [URL].") 
    } 

    url := os.Args[1] 
    if !strings.HasPrefix(url, "http://") { 
     url = "http://" + url 
    } 

    wg := NewWorkGroup(1) 
    wg.Crawl(url) 
    for k, v := range wg.urlMap { 
     fmt.Printf("%s: %d\n", k, v) 
    } 
} 

// represents a single link and its depth
type Link struct { 
    url string 
    deph uint32 
} 

// wraps all around to group 
type WorkGroup struct { 
    *sync.WaitGroup 
    maxDeph uint32 
    numW int 
    pool chan *Worker 
    linkQ chan Link 
    urlMap map[string]uint32 
} 

type Worker struct { 
    result chan []Link 
} 

func newWorker() *Worker { 
    return &Worker{ 
     result: make(chan []Link), 
    } 
} 

func NewWorkGroup(maxDeph uint32) *WorkGroup { 
    numW := int(maxDeph) 
    if maxDeph > 10 { 
     numW = 10 
    } 
    return &WorkGroup{ 
     WaitGroup: new(sync.WaitGroup), 
     maxDeph: maxDeph, 
     numW:  numW, 
     pool:  make(chan *Worker, numW), 
     linkQ:  make(chan Link, 100), 
     urlMap: make(map[string]uint32), 
    } 
} 

// dispatch workers -> filter visited -> send not visited to channel 
// pool + dispatcher keep order so workers go level by level 
func (wg *WorkGroup) spawnDispatcher() { 
    wg.Add(1) 
    go func() { 
     defer wg.Done() 
     defer close(wg.linkQ) 

     for w := range wg.pool { 
      links := <-w.result 
      for i := 0; i < len(links); i++ { 
       if _, ok := wg.urlMap[links[i].url]; !ok { 
        wg.urlMap[links[i].url] = links[i].deph 

        // don't process links that reach maxDeph
        if links[i].deph < wg.maxDeph { 
         select { 
         case wg.linkQ <- links[i]: 
          // goes well 
          continue 
         default: 
          // channel is too short, protecting possible deadlock 
         } 
         // drop rest of links 
         break 
        } 
       } 
      } 
      // empty link channel + nothing in process = end 
      if len(wg.linkQ) == 0 && len(wg.pool) == 0 { 
       return 
      } 
     } 
    }() 
} 

//initialize goroutines and crawl url 
func (wg *WorkGroup) Crawl(url string) { 
    defer close(wg.pool) 
    wg.spawnCrawlers() 
    wg.spawnDispatcher() 
    wg.linkQ <- Link{url: url, deph: 0} 
    wg.Wait() 
} 

func (wg *WorkGroup) spawnCrawlers() { 
    // custom num of workers, used maxDeph 
    for i := 0; i < wg.numW; i++ { 
     wg.newCrawler() 
    } 
} 

func (wg *WorkGroup) newCrawler() { 
    wg.Add(1) 
    go func(w *Worker) { 
     defer wg.Done() 
     defer close(w.result) 

     for link := range wg.linkQ { 
      wg.pool <- w 
      w.result <- getExternalUrls(link) 
     } 
    }(newWorker()) 
} 

// default crawl function, slightly modified
func getExternalUrls(source Link) []Link { 
    resp, err := http.Get(source.url) 
    if err != nil { 
     log.Printf("Can not reach the site. Error = %v\n", err) 
     return nil 
    } 

    b := resp.Body 
    defer b.Close() 

    z := html.NewTokenizer(b) 

    links := []Link{} 

    for { 
     token := z.Next() 

     switch token { 
     case html.ErrorToken: 
      return links 
     case html.StartTagToken: 
      current := z.Token() 
      if current.Data != "a" { 
       continue 
      } 
      url, ok := getHrefTag(current) 
      if ok && strings.HasPrefix(url, "http") { 
       links = append(links, Link{url: url, deph: source.deph + 1}) 
      } 
     } 
    } 
    return links 
} 

//default function 
func getHrefTag(token html.Token) (result string, ok bool) { 
    for _, a := range token.Attr { 
     if a.Key == "href" { 
      result = a.Val 
      ok = true 
      break 
     } 
    } 
    return 
} 

This looks interesting. In fact, the code I wrote is a modified version of code from someone else's blog post, but I kept its main structure. Why do you think doing it your way is better? Can you give the rationale behind this flat model? – newguy


When I look at your code (the updated version), every goroutine blocks its parent, so only one goroutine is working at a time. Say the depth is set to 999: there would be 998 goroutines waiting for the last one. I don't think that is what goroutines are for. Actually I think it could be a rather expensive approach. – bigless


Flat = only main spawns goroutines, no recursion – bigless
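
For what the comments call a "flat" model, here is a minimal sketch (my own simplification, not the answer's code): main starts a fixed number of workers once, and they all pull URLs from a shared channel instead of recursively spawning a goroutine per link.

package main

import (
    "fmt"
    "sync"
)

func main() {
    jobs := make(chan string, 10)
    var wg sync.WaitGroup

    // Only main spawns goroutines; the worker count is fixed and does not
    // grow with crawl depth or with the number of links found.
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for url := range jobs {
                fmt.Println("crawling", url) // stand-in for the real fetch/parse work
            }
        }()
    }

    for _, u := range []string{"http://a.example", "http://b.example", "http://c.example"} {
        jobs <- u
    }
    close(jobs) // lets the workers' range loops end
    wg.Wait()
}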