-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcallbacks.go
76 lines (63 loc) · 1.61 KB
/
callbacks.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package main
import (
"log"
"net/http"
"strings"
"time"
"github.com/PuerkitoBio/gocrawl"
"github.com/PuerkitoBio/goquery"
"github.com/maddevsio/spiderwoman/lib"
)
func (e *Ext) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
log.Printf("Visit: %s\n", ctx.URL())
if doc == nil {
return nil, true
}
doc.Find("a").Each(func(i int, s *goquery.Selection) {
href, _ := s.Attr("href")
href = strings.ToLower(href)
// analyze absolute urls, e.g. http://bla.com/lolz
if strings.Contains(href, ctx.URL().Host) {
if !lib.HasInternalOutPatterns(href, internalOutPatterns) {
return
} else {
if verbose {
log.Print(href)
}
}
}
// analyze relative urls, e.g. /lolz.html
if !strings.HasPrefix(href, "http") {
if !lib.HasInternalOutPatterns(href, internalOutPatterns) {
return
} else {
href = ctx.URL().Scheme + "://" + ctx.URL().Host + href
if verbose {
log.Print(href)
}
}
}
if lib.HasStopHost(href, StopHosts) {
return
}
if lib.HasBadSuffixes(href, badSuffixes) {
return
}
mutex.Lock()
if externalLinks[ctx.URL().Host] == nil {
externalLinks[ctx.URL().Host] = make(map[string]int)
}
externalLinks[ctx.URL().Host][href] += 1
mutex.Unlock()
})
return nil, true
}
func (e *Ext) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
return true
}
func (de *Ext) RequestRobots(ctx *gocrawl.URLContext, robotAgent string) (data []byte, doRequest bool) {
return nil, false
}
func (e *Ext) ComputeDelay(host string, di *gocrawl.DelayInfo, lastFetch *gocrawl.FetchInfo) time.Duration {
return 0
}