-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 52a7273
Showing
8 changed files
with
721 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
config.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
) | ||
|
||
// is_duplicate_entry reports whether err is non-nil and its message contains
// the case-sensitive substring "Duplicate entry".
func is_duplicate_entry(err error) bool {
	return err != nil && strings.Contains(err.Error(), "Duplicate entry")
}
|
||
// ERROR_NOT_SELLER is a package-level sentinel error. Its message is Chinese
// for "the seller link does not exist".
var ERROR_NOT_SELLER = fmt.Errorf("不存在商家链接")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
module amazon-crawler | ||
|
||
go 1.19 | ||
|
||
require ( | ||
github.com/PuerkitoBio/goquery v1.8.1 | ||
github.com/chromedp/chromedp v0.9.3 | ||
github.com/go-sql-driver/mysql v1.7.1 | ||
github.com/tengfei-xy/go-log v0.1.2 | ||
gopkg.in/yaml.v3 v3.0.1 | ||
) | ||
|
||
require ( | ||
github.com/andybalholm/cascadia v1.3.1 // indirect | ||
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 // indirect | ||
github.com/chromedp/sysutil v1.0.0 // indirect | ||
github.com/gobwas/httphead v0.1.0 // indirect | ||
github.com/gobwas/pool v0.2.1 // indirect | ||
github.com/gobwas/ws v1.3.0 // indirect | ||
github.com/josharian/intern v1.0.0 // indirect | ||
github.com/mailru/easyjson v0.7.7 // indirect | ||
golang.org/x/net v0.7.0 // indirect | ||
golang.org/x/sys v0.6.0 // indirect | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= | ||
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= | ||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= | ||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= | ||
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 h1:2zipcnjfFdqAjOQa8otCCh0Lk1M7RBzciy3s80YAKHk= | ||
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= | ||
github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg= | ||
github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA= | ||
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= | ||
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= | ||
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= | ||
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= | ||
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= | ||
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= | ||
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= | ||
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= | ||
github.com/gobwas/ws v1.3.0 h1:sbeU3Y4Qzlb+MOzIe6mQGf7QR4Hkv6ZD0qhGkBFL2O0= | ||
github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= | ||
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= | ||
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= | ||
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= | ||
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= | ||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= | ||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= | ||
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= | ||
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= | ||
github.com/tengfei-xy/go-log v0.1.2 h1:4n/x/6YsQF6MzzG5Prhg11OUZOoY11LcQlK5fwPj7TQ= | ||
github.com/tengfei-xy/go-log v0.1.2/go.mod h1:k3W+Vs69F7ldZQ88XqDP1r0b9oo6NvxORmH50FQAnHE= | ||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= | ||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= | ||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= | ||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= | ||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= | ||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= | ||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= | ||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= | ||
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= | ||
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= | ||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= | ||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= | ||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= | ||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= | ||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= | ||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= | ||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= | ||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= | ||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= | ||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= | ||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= | ||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= | ||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= | ||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= | ||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= | ||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= | ||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
package main | ||
|
||
import ( | ||
"database/sql" | ||
"fmt" | ||
"os" | ||
"os/signal" | ||
|
||
_ "github.com/go-sql-driver/mysql" | ||
log "github.com/tengfei-xy/go-log" | ||
"gopkg.in/yaml.v3" | ||
) | ||
|
||
// AMAZON_UK is the base URL of the Amazon UK site.
const AMAZON_UK = "https://www.amazon.co.uk"

// Values stored in the status column of the application table (see
// appConfig.update and appConfig.end).
const (
	MYSQL_APPLICATION_STATUS_START  int = 0
	MYSQL_APPLICATION_STATUS_OVER   int = 1
	MYSQL_APPLICATION_STATUS_SEARCH int = 2
	MYSQL_APPLICATION_STATUS_SELLER int = 3
	MYSQL_APPLICATION_STATUS_TRN    int = 4
)
|
||
type (
	// appConfig is the root of config.yaml plus the runtime state of the
	// program. The two embedded structs are filled from the "mysql" and
	// "identified" YAML keys; the unexported fields are set at runtime.
	appConfig struct {
		Mysql      `yaml:"mysql"`
		Identified `yaml:"identified"`
		// db is the connection pool opened by init_config.
		db *sql.DB
		// primary_id is the id of this run's row in the application table,
		// set by start.
		primary_id int64
	}

	// Identified holds the numeric identifier of this program instance.
	Identified struct {
		App int `yaml:"app"`
	}

	// Mysql holds the connection parameters used to build the MySQL DSN.
	Mysql struct {
		Ip       string `yaml:"ip"`
		Port     string `yaml:"port"`
		Username string `yaml:"username"`
		Password string `yaml:"password"`
		Database string `yaml:"database"`
	}
)

// app is the process-wide configuration and state shared by all stages.
var app appConfig
|
||
func init_config() { | ||
yamlFile, err := os.ReadFile("config.yaml") | ||
if err != nil { | ||
panic(err) | ||
} | ||
err = yaml.Unmarshal(yamlFile, &app) | ||
if err != nil { | ||
panic(err) | ||
} | ||
log.Infof("程序标识:%d", app.Identified.App) | ||
|
||
DB, err := sql.Open("mysql", fmt.Sprintf("%s:%s@tcp(%s:%s)/%s", app.Mysql.Username, app.Mysql.Password, app.Mysql.Ip, app.Mysql.Port, app.Mysql.Database)) | ||
if err != nil { | ||
panic(err) | ||
} | ||
DB.SetConnMaxLifetime(100) | ||
DB.SetMaxIdleConns(10) | ||
if err := DB.Ping(); err != nil { | ||
panic(err) | ||
} | ||
log.Info("数据库已连接") | ||
app.db = DB | ||
|
||
} | ||
|
||
func init_signal() { | ||
c := make(chan os.Signal, 1) | ||
signal.Notify(c, os.Interrupt) | ||
signal.Notify(c, os.Kill) | ||
<-c | ||
app.end() | ||
app.db.Close() | ||
log.Infof("程序结束") | ||
} | ||
func main() { | ||
init_config() | ||
app.start() | ||
|
||
// go init_signal() | ||
|
||
// var s search | ||
// s.main() | ||
|
||
var seller sellerStruct | ||
seller.main() | ||
|
||
var trn trnStruct | ||
trn.main() | ||
|
||
app.end() | ||
} | ||
func (app *appConfig) start() { | ||
r, err := app.db.Exec("insert into application (app_id) values(?)", app.Identified.App) | ||
if err != nil { | ||
panic(err) | ||
} | ||
id, err := r.LastInsertId() | ||
if err != nil { | ||
panic(err) | ||
} | ||
app.primary_id = id | ||
} | ||
func (app *appConfig) update(status int) { | ||
_, err := app.db.Exec("update application set status=? where id=?", status, app.primary_id) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
func (app *appConfig) end() { | ||
app.db.Exec("update into application set status=? where id=?", MYSQL_APPLICATION_STATUS_OVER, app.primary_id) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/PuerkitoBio/goquery" | ||
"github.com/chromedp/chromedp" | ||
log "github.com/tengfei-xy/go-log" | ||
) | ||
|
||
// Status values for a row of the search_statistics table. search_end writes
// MYSQL_SEARCH_STATUS_OVER; search_start only logs MYSQL_SEARCH_STATUS_START.
const (
	MYSQL_SEARCH_STATUS_START int64 = 0
	MYSQL_SEARCH_STATUS_OVER  int64 = 1
)
|
||
// search carries the state of the keyword-search stage while it walks the
// category table and fetches result pages.
type search struct {
	// zh_key and en_key are the keyword columns read from the category
	// table; en_key is rewritten by set_en_key before it is put in a URL.
	zh_key, en_key string
	// category_id is the id of the category row being processed.
	category_id int64
	url         string
	// start and end bound the result pages requested per keyword; the page
	// loop runs while start < end.
	start, end int
	// html is the HTML of the most recently fetched page.
	html string
	// valid counts the product rows inserted successfully.
	valid int
}
|
||
func (s *search) main() error { | ||
app.update(MYSQL_APPLICATION_STATUS_SEARCH) | ||
|
||
log.Infof("------------------------") | ||
log.Infof("1. 开始搜索关键词") | ||
row, err := app.db.Query(`select id,zh_key,en_key from category order by priority`) | ||
if err != nil { | ||
return err | ||
} | ||
s.start = 1 | ||
s.end = 10 | ||
for row.Next() { | ||
row.Scan(&s.category_id, &s.zh_key, &s.en_key) | ||
s.en_key = s.set_en_key() | ||
insert_id, err := s.search_start() | ||
if err != nil { | ||
log.Errorf("插入失败 关键词:%s %v", s.zh_key, err) | ||
continue | ||
} | ||
for ; s.start < s.end; s.start++ { | ||
h, err := s.NewRequest(s.start) | ||
if err != nil { | ||
log.Error(err) | ||
continue | ||
} | ||
s.get_product_url(h) | ||
} | ||
err = s.search_end(insert_id) | ||
if err != nil { | ||
log.Errorf("更新结果失败 关键词:%s %v", s.zh_key, err) | ||
continue | ||
} | ||
s.start = 1 | ||
} | ||
log.Infof("------------------------") | ||
return nil | ||
} | ||
func (s *search) search_start() (int64, error) { | ||
r, err := app.db.Exec("insert into search_statistics(category_id,app) values(?,?)", s.category_id, app.Identified.App) | ||
if err != nil { | ||
return 0, err | ||
} | ||
|
||
id, err := r.LastInsertId() | ||
if err != nil { | ||
return 0, err | ||
} | ||
log.Infof("开始搜索 关键词:%s 关键词ID:%d 状态:%d(开始)", s.zh_key, s.category_id, MYSQL_SEARCH_STATUS_START) | ||
return id, nil | ||
} | ||
func (s *search) search_end(insert_id int64) error { | ||
_, err := app.db.Exec("update search_statistics set status=?,end=CURRENT_TIMESTAMP,valid=? where id=?", MYSQL_SEARCH_STATUS_OVER, s.valid, insert_id) | ||
if err != nil { | ||
return err | ||
} | ||
log.Infof("搜索完成 关键词:%s 完成ID:%d 有效数:%d", s.zh_key, insert_id, s.valid) | ||
return nil | ||
} | ||
func (s *search) set_en_key() string { | ||
return strings.ReplaceAll(strings.ReplaceAll(s.en_key, " ", "+"), "'", "%27") | ||
} | ||
func (s *search) NewRequest(seq int) (string, error) { | ||
url := fmt.Sprintf("https://www.amazon.co.uk/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", s.en_key, seq) | ||
log.Infof("开始搜索 关键词:%s 页面:%d url:%s", s.zh_key, seq, url) | ||
|
||
// 创建一个新的上下文 | ||
ctx, cancel := chromedp.NewContext(context.Background()) | ||
defer cancel() | ||
|
||
// 运行任务 | ||
var htmlContent string | ||
err := chromedp.Run(ctx, | ||
chromedp.Navigate(url), | ||
chromedp.OuterHTML("html", &htmlContent), | ||
) | ||
if err != nil { | ||
return "", err | ||
} | ||
s.html = htmlContent | ||
// 打印最终的 HTML 代码 | ||
return htmlContent, nil | ||
} | ||
|
||
func (s *search) get_product_url(body string) { | ||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(body)) | ||
if err != nil { | ||
log.Errorf("内部错误:%v", err) | ||
return | ||
} | ||
defer func() { | ||
if err := recover(); err != nil { | ||
fmt.Println(body) | ||
return | ||
} | ||
}() | ||
|
||
res := doc.Find("div[class~=s-search-results]").First() | ||
|
||
res.Find("div[data-index]").Each(func(i int, h *goquery.Selection) { | ||
// 处理找到的 div 元素 | ||
link, exist := h.Find("a").First().Attr("href") | ||
if !exist { | ||
return | ||
} | ||
if strings.HasPrefix(link, "/s") || strings.HasPrefix(link, "/gp/") { | ||
return | ||
} | ||
url := strings.Split(link, "/ref=") | ||
_, err := app.db.Exec(`INSERT INTO product(url,param) values(?,?)`, url[0], "/ref="+url[1]) | ||
|
||
if is_duplicate_entry(err) { | ||
log.Infof("已存在 关键词:%s 链接:%s ", s.zh_key, link) | ||
return | ||
} | ||
if err != nil { | ||
log.Errorf("插入失败 关键词:%s 链接:%s %v ", s.zh_key, link, err) | ||
return | ||
} | ||
|
||
log.Infof("插入成功 关键词:%s 链接:%s ", s.zh_key, link) | ||
s.valid += 1 | ||
}) | ||
} |
Oops, something went wrong.