Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
tengfei-xy committed Nov 14, 2023
0 parents commit 52a7273
Show file tree
Hide file tree
Showing 8 changed files with 721 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config.yaml
18 changes: 18 additions & 0 deletions error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package main

import (
"fmt"
"strings"
)

// is_duplicate_entry reports whether err is non-nil and its message contains
// the text "Duplicate entry". The match is a plain, case-sensitive substring
// test on err.Error().
func is_duplicate_entry(err error) bool {
	return err != nil && strings.Contains(err.Error(), "Duplicate entry")
}

// ERROR_NOT_SELLER is a package-level error value whose message reads
// "不存在商家链接" (no seller link exists).
var ERROR_NOT_SELLER error = fmt.Errorf("不存在商家链接")
24 changes: 24 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
module amazon-crawler

go 1.19

require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/chromedp/chromedp v0.9.3
github.com/go-sql-driver/mysql v1.7.1
github.com/tengfei-xy/go-log v0.1.2
gopkg.in/yaml.v3 v3.0.1
)

require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.3.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
golang.org/x/net v0.7.0 // indirect
golang.org/x/sys v0.6.0 // indirect
)
65 changes: 65 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998 h1:2zipcnjfFdqAjOQa8otCCh0Lk1M7RBzciy3s80YAKHk=
github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg=
github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.3.0 h1:sbeU3Y4Qzlb+MOzIe6mQGf7QR4Hkv6ZD0qhGkBFL2O0=
github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/tengfei-xy/go-log v0.1.2 h1:4n/x/6YsQF6MzzG5Prhg11OUZOoY11LcQlK5fwPj7TQ=
github.com/tengfei-xy/go-log v0.1.2/go.mod h1:k3W+Vs69F7ldZQ88XqDP1r0b9oo6NvxORmH50FQAnHE=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
110 changes: 110 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package main

import (
	"database/sql"
	"fmt"
	"os"
	"os/signal"
	"time"

	_ "github.com/go-sql-driver/mysql"
	log "github.com/tengfei-xy/go-log"
	"gopkg.in/yaml.v3"
)

// AMAZON_UK is the base URL of the Amazon UK site.
const AMAZON_UK = "https://www.amazon.co.uk"

// Status codes for the application table's status column. The values are
// written to the database, so they are spelled out explicitly.
const (
	MYSQL_APPLICATION_STATUS_START  int = 0
	MYSQL_APPLICATION_STATUS_OVER   int = 1
	MYSQL_APPLICATION_STATUS_SEARCH int = 2
	MYSQL_APPLICATION_STATUS_SELLER int = 3
	MYSQL_APPLICATION_STATUS_TRN    int = 4
)

// appConfig combines the settings decoded from config.yaml with the runtime
// state the program keeps while it is running.
type appConfig struct {
// Embedded MySQL connection settings, decoded from the "mysql" YAML key.
Mysql `yaml:"mysql"`
// Embedded program identity, decoded from the "identified" YAML key.
Identified `yaml:"identified"`
// Database handle; assigned by init_config after a successful Ping.
db *sql.DB
// Id of the application-table row inserted by start; update and end use it
// in their WHERE clauses.
primary_id int64
}
// Identified carries the identity of this program instance as configured in
// config.yaml.
type Identified struct {
// Numeric application identifier; start stores it in application.app_id and
// search_start stores it in search_statistics.app.
App int `yaml:"app"`
}
// Mysql holds the MySQL connection settings read from config.yaml.
// init_config joins the fields into a DSN of the form
// username:password@tcp(ip:port)/database.
type Mysql struct {
// Server host; placed before the colon inside tcp(...).
Ip string `yaml:"ip"`
// Server port, kept as a string because it is only formatted into the DSN.
Port string `yaml:"port"`
// Account name used to log in.
Username string `yaml:"username"`
// Account password used to log in.
Password string `yaml:"password"`
// Name of the database (schema) to select.
Database string `yaml:"database"`
}

// app is the process-wide configuration and runtime state. init_config fills
// it in, and the search methods reach the database through app.db.
var app appConfig

// init_config reads config.yaml from the working directory into the global
// app value, then opens the MySQL connection pool, verifies it with a Ping and
// stores it in app.db.
//
// Every failure (unreadable file, invalid YAML, unreachable database) panics.
func init_config() {
	yamlFile, err := os.ReadFile("config.yaml")
	if err != nil {
		panic(err)
	}
	if err := yaml.Unmarshal(yamlFile, &app); err != nil {
		panic(err)
	}
	log.Infof("程序标识:%d", app.Identified.App)

	dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s", app.Mysql.Username, app.Mysql.Password, app.Mysql.Ip, app.Mysql.Port, app.Mysql.Database)
	db, err := sql.Open("mysql", dsn)
	if err != nil {
		panic(err)
	}
	// SetConnMaxLifetime takes a time.Duration. A bare 100 would mean 100
	// nanoseconds and make every pooled connection expire immediately, so the
	// unit is stated explicitly.
	db.SetConnMaxLifetime(100 * time.Second)
	db.SetMaxIdleConns(10)
	if err := db.Ping(); err != nil {
		// Do not leave a half-initialised pool behind before giving up.
		_ = db.Close()
		panic(err)
	}
	log.Info("数据库已连接")
	app.db = db
}

// init_signal blocks until the process receives an interrupt signal, then
// marks the run as finished via app.end, closes the database handle and logs
// the shutdown. It blocks its caller; the (currently commented-out) call in
// main starts it with `go`.
func init_signal() {
	c := make(chan os.Signal, 1)
	// Only os.Interrupt is registered. os.Kill (SIGKILL) cannot be caught by a
	// program, so asking to be notified of it has no effect.
	signal.Notify(c, os.Interrupt)
	<-c
	app.end()
	if err := app.db.Close(); err != nil {
		log.Errorf("关闭数据库失败 %v", err)
	}
	log.Infof("程序结束")
}
// main loads the configuration, registers this run in the application table,
// executes the seller stage followed by the trn stage, and finally marks the
// run as finished.
func main() {
	init_config()
	app.start()

	// Signal handling and the keyword-search stage are currently disabled:
	//   go init_signal()
	//   var s search
	//   s.main()

	// Each stage runs on a fresh zero value of its type.
	new(sellerStruct).main()
	new(trnStruct).main()

	app.end()
}
// start inserts a row for this run into the application table, using the
// configured app id, and remembers the new row's id in app.primary_id.
// It panics if the insert fails or the id cannot be retrieved.
func (app *appConfig) start() {
	result, execErr := app.db.Exec("insert into application (app_id) values(?)", app.Identified.App)
	if execErr != nil {
		panic(execErr)
	}

	rowID, idErr := result.LastInsertId()
	if idErr != nil {
		panic(idErr)
	}

	app.primary_id = rowID
}
// update writes status into the status column of this run's row in the
// application table (the row whose id is app.primary_id). It panics when the
// statement fails.
func (app *appConfig) update(status int) {
	if _, execErr := app.db.Exec("update application set status=? where id=?", status, app.primary_id); execErr != nil {
		panic(execErr)
	}
}
// end marks this run's row in the application table as finished by setting
// its status to MYSQL_APPLICATION_STATUS_OVER. A failure is logged rather than
// raised, since end is called on the way out of the program.
func (app *appConfig) end() {
	// The statement has to be a plain UPDATE: "update into ..." is not valid
	// MySQL and would leave the status unchanged.
	_, err := app.db.Exec("update application set status=? where id=?", MYSQL_APPLICATION_STATUS_OVER, app.primary_id)
	if err != nil {
		log.Errorf("更新程序状态失败 %v", err)
	}
}
149 changes: 149 additions & 0 deletions search.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package main

import (
"context"
"fmt"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
log "github.com/tengfei-xy/go-log"
)

// Status codes for a search_statistics row: START is logged when a keyword
// search begins, OVER is written to the status column when it completes.
const (
	MYSQL_SEARCH_STATUS_START int64 = 0
	MYSQL_SEARCH_STATUS_OVER  int64 = 1
)

// search holds the state of the keyword-search stage: the category row being
// processed, the page range to scan and the results gathered so far.
type search struct {
// zh_key column of the current category row; used in log messages.
zh_key string
// en_key column of the current category row, rewritten by set_en_key before
// it is placed in the search URL.
en_key string
// id column of the current category row.
category_id int64
// Not read or written by any method visible in this file.
url string
// Page number currently being requested; main iterates while start < end.
start int
// Exclusive upper bound of the page loop; main sets it to 10.
end int
// HTML returned by the most recent successful NewRequest call.
html string
// Number of product rows inserted by get_product_url; search_end writes it
// to search_statistics.valid.
valid int
}

// main runs the keyword-search stage. It marks the application as searching,
// reads every row of the category table in priority order and, for each one,
// records a search_statistics row, fetches result pages 1 through s.end-1,
// stores the product links found on them and finally records the outcome.
//
// A failure on a single keyword or page is logged and that item is skipped.
// The returned error is non-nil only when the category query itself fails or
// iterating its rows ends with an error.
func (s *search) main() error {
	app.update(MYSQL_APPLICATION_STATUS_SEARCH)

	log.Infof("------------------------")
	log.Infof("1. 开始搜索关键词")
	rows, err := app.db.Query(`select id,zh_key,en_key from category order by priority`)
	if err != nil {
		return err
	}
	// Release the result set and its connection on every exit path.
	defer rows.Close()

	s.start = 1
	s.end = 10
	for rows.Next() {
		if err := rows.Scan(&s.category_id, &s.zh_key, &s.en_key); err != nil {
			// Without a clean scan the fields would still hold the previous row.
			log.Errorf("读取关键词失败 %v", err)
			continue
		}
		s.en_key = s.set_en_key()
		// The valid count is reported per keyword, so it starts from zero each time.
		s.valid = 0

		insert_id, err := s.search_start()
		if err != nil {
			log.Errorf("插入失败 关键词:%s %v", s.zh_key, err)
			continue
		}

		// The page counter restarts in the loop header so that a failure on
		// one keyword can never leave the next keyword with no pages to scan.
		for s.start = 1; s.start < s.end; s.start++ {
			h, err := s.NewRequest(s.start)
			if err != nil {
				log.Error(err)
				continue
			}
			s.get_product_url(h)
		}

		if err := s.search_end(insert_id); err != nil {
			log.Errorf("更新结果失败 关键词:%s %v", s.zh_key, err)
		}
	}
	s.start = 1

	log.Infof("------------------------")
	// Report an iteration that stopped early because of a driver error.
	return rows.Err()
}
// search_start inserts a search_statistics row for the current category and
// application id, logs the start of the search and returns the id of the new
// row. On any database error it returns 0 and that error.
func (s *search) search_start() (int64, error) {
	result, execErr := app.db.Exec("insert into search_statistics(category_id,app) values(?,?)", s.category_id, app.Identified.App)
	if execErr != nil {
		return 0, execErr
	}

	rowID, idErr := result.LastInsertId()
	if idErr != nil {
		return 0, idErr
	}

	log.Infof("开始搜索 关键词:%s 关键词ID:%d 状态:%d(开始)", s.zh_key, s.category_id, MYSQL_SEARCH_STATUS_START)
	return rowID, nil
}
// search_end completes the search_statistics row identified by insert_id: it
// sets the status to MYSQL_SEARCH_STATUS_OVER, stamps the end time and stores
// the current s.valid count. The database error, if any, is returned and
// nothing is logged in that case.
func (s *search) search_end(insert_id int64) error {
	if _, execErr := app.db.Exec("update search_statistics set status=?,end=CURRENT_TIMESTAMP,valid=? where id=?", MYSQL_SEARCH_STATUS_OVER, s.valid, insert_id); execErr != nil {
		return execErr
	}

	log.Infof("搜索完成 关键词:%s 完成ID:%d 有效数:%d", s.zh_key, insert_id, s.valid)
	return nil
}
// set_en_key returns s.en_key prepared for use in a search URL: every space
// becomes "+" and every apostrophe becomes "%27". The receiver is not
// modified; the caller assigns the result.
func (s *search) set_en_key() string {
	escaper := strings.NewReplacer(" ", "+", "'", "%27")
	return escaper.Replace(s.en_key)
}
// NewRequest opens page seq of the Amazon UK search results for s.en_key in a
// new chromedp context and returns the outer HTML of the page's html element.
// On success the same HTML is also kept in s.html; on failure it returns an
// empty string and the error from chromedp.Run.
func (s *search) NewRequest(seq int) (string, error) {
	target := fmt.Sprintf("https://www.amazon.co.uk/s?k=%s&page=%d&crid=2V9436DZJ6IJF&qid=1699839233&sprefix=clothe%%2Caps%%2C552&ref=sr_pg_2", s.en_key, seq)
	log.Infof("开始搜索 关键词:%s 页面:%d url:%s", s.zh_key, seq, target)

	// Every request gets its own chromedp context, cancelled on return.
	browserCtx, release := chromedp.NewContext(context.Background())
	defer release()

	// Navigate to the page, then capture the resulting document.
	var page string
	if runErr := chromedp.Run(browserCtx,
		chromedp.Navigate(target),
		chromedp.OuterHTML("html", &page),
	); runErr != nil {
		return "", runErr
	}

	s.html = page
	return page, nil
}

// get_product_url parses body as a search-results page and stores every
// product link it finds in the product table.
//
// For each div[data-index] inside the first s-search-results container it
// takes the href of the first anchor, skips links starting with "/s" or
// "/gp/", and splits the link at the first "/ref=": the part before it goes
// into the url column, the remainder (including "/ref=") into the param
// column. A link without "/ref=" is stored with an empty param. s.valid is
// incremented for every row actually inserted; duplicates and failed inserts
// are only logged.
func (s *search) get_product_url(body string) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		log.Errorf("内部错误:%v", err)
		return
	}
	// Safety net: an unexpected panic while walking the document is logged
	// with its cause instead of terminating the whole crawl.
	defer func() {
		if r := recover(); r != nil {
			log.Errorf("内部错误:%v", r)
		}
	}()

	results := doc.Find("div[class~=s-search-results]").First()

	results.Find("div[data-index]").Each(func(_ int, item *goquery.Selection) {
		link, exist := item.Find("a").First().Attr("href")
		if !exist {
			return
		}
		if strings.HasPrefix(link, "/s") || strings.HasPrefix(link, "/gp/") {
			return
		}

		// Cut never indexes past the end, so a link without "/ref=" no longer
		// aborts processing of the remaining results on the page.
		base, param, found := strings.Cut(link, "/ref=")
		if found {
			param = "/ref=" + param
		}
		_, err := app.db.Exec(`INSERT INTO product(url,param) values(?,?)`, base, param)

		if is_duplicate_entry(err) {
			log.Infof("已存在 关键词:%s 链接:%s ", s.zh_key, link)
			return
		}
		if err != nil {
			log.Errorf("插入失败 关键词:%s 链接:%s %v ", s.zh_key, link, err)
			return
		}

		log.Infof("插入成功 关键词:%s 链接:%s ", s.zh_key, link)
		s.valid += 1
	})
}
Loading

0 comments on commit 52a7273

Please sign in to comment.