-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.go
82 lines (71 loc) · 2.09 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package main
import (
"sync"
"os"
"github.com/PuerkitoBio/gocrawl"
"github.com/maddevsio/simple-config"
"github.com/maddevsio/spiderwoman/lib"
"github.com/urfave/cli"
)
// Ext is the crawler extender type used by this program. It embeds
// *gocrawl.DefaultExtender, so every method of the default extender is
// promoted onto Ext; any overrides are defined outside this declaration.
type Ext struct {
// Embedded default implementation supplying the promoted methods.
*gocrawl.DefaultExtender
}
// Mutable package-level state. All of these start at their zero value and are
// populated elsewhere in the package.
var (
	// Synchronization primitives.
	mutex       sync.Mutex
	syncResolve sync.WaitGroup

	// Host lists.
	hosts     []string
	StopHosts []lib.StopHostItem

	// Shared error slot.
	err error

	// External-link bookkeeping: a running counter plus two nested
	// string -> string -> int tables (raw and resolved).
	externalLinksIterator int
	externalLinks         map[string]map[string]int
	externalLinksResolved map[string]map[string]int
)

// Settings. config is loaded from "./config" (type "yml"); the two file paths
// are read from it, so they are initialized after config.
var (
	config simple_config.SimpleConfig = simple_config.NewSimpleConfig("./config", "yml")

	// Values read from the configuration under the "db-path" and "xls-path" keys.
	sqliteDBPath  = config.GetString("db-path")
	excelFilePath = config.GetString("xls-path")

	// Hard-coded defaults.
	// NOTE(review): units of resolveTimeout are not visible here — confirm at the use site.
	userAgent       = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
	resolveURLsPool = 100
	verbose         = true
	maxVisits       = 10
	resolveTimeout  = 30

	// URL fragments and file suffixes used for link classification.
	// NOTE(review): presumably matched against crawled URLs — verify against callers.
	internalOutPatterns = []string{"/go/", "/go.php?", "/goto/", "/banners/click/", "/adrotate-out.php?", "/bsdb/bs.php?"}
	badSuffixes         = []string{".png", ".jpg", ".pdf"}
)
// Path groups the file-system locations used by the program. Every field is a
// plain string; judging by the names, each "Default" field is the fallback for
// its non-default counterpart — TODO confirm against the code that fills it in.
type Path struct {
// Location of the SQLite database file (by name; confirm at the use site).
SqliteDBPath string
// Sources file and its default counterpart.
SourcesFilePath string
SourcesDefaultFilePath string
// Types file and its default counterpart.
TypesFilePath string
TypesDefaultFilePath string
}
// main builds the Spiderwoman command-line application and runs the
// sub-command selected by the process arguments.
//
// Registered commands (each with a one-letter alias):
//   - once    (o): handled by actionOnce
//   - forever (f): handled by actionForever
//   - excel   (e): handled by actionExcel
//   - grab    (g): handled by actionGrab
//
// If running the application returns an error, the error text is written to
// standard error and the process exits with status 1.
func main() {
	app := cli.NewApp()
	app.Name = "Spiderwoman"
	app.Usage = "Vertical crawler, which main target is to count links (resolved, e.g. from bit.ly) to external domains from all pages of given resources"
	app.Commands = []cli.Command{
		{
			Name:    "once",
			Aliases: []string{"o"},
			Usage:   "run the crawl and stop",
			Action:  actionOnce,
		},
		{
			Name:    "forever",
			Aliases: []string{"f"},
			Usage:   "start crawl forever using cron feature",
			Action:  actionForever,
		},
		{
			Name:    "excel",
			Aliases: []string{"e"},
			Usage:   "only create xls file",
			Action:  actionExcel,
		},
		{
			Name:    "grab",
			Aliases: []string{"g"},
			Usage:   "use grabber service only",
			Action:  actionGrab,
		},
	}

	// app.Run reports failures through its return value. Previously the value
	// was dropped, so a failed command still exited with status 0. A distinct
	// local name avoids shadowing the package-level err variable.
	if runErr := app.Run(os.Args); runErr != nil {
		// Best-effort report: nothing useful can be done if stderr itself fails.
		_, _ = os.Stderr.WriteString(runErr.Error() + "\n")
		os.Exit(1)
	}
}