forked from tengfei-xy/amazon-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
product.go
202 lines (176 loc) · 5.65 KB
/
product.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
package main
import (
"fmt"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
log "github.com/tengfei-xy/go-log"
)
// productStruct holds the seller information extracted from the product page
// that is currently being processed.
type productStruct struct {
	// url is the link to the seller's page as found on the product page.
	url string
	// id is the seller ID taken from the product page.
	id string
}
// Values stored in the status column of the product table.
const (
	// MYSQL_PRODUCT_STATUS_INSERT is one of the two statuses from which
	// main claims rows for processing.
	MYSQL_PRODUCT_STATUS_INSERT int = 0
	// MYSQL_PRODUCT_STATUS_CHEKCK marks a row claimed by an application
	// instance and waiting to be processed.
	MYSQL_PRODUCT_STATUS_CHEKCK int = 1
	// MYSQL_PRODUCT_STATUS_OVER marks a row whose seller ID was stored.
	MYSQL_PRODUCT_STATUS_OVER int = 2
	// MYSQL_PRODUCT_STATUS_ERROR_OVER marks a row whose page request
	// failed; main claims such rows again on a later pass.
	MYSQL_PRODUCT_STATUS_ERROR_OVER int = 3
	// MYSQL_PRODUCT_STATUS_NO_PRODUCT marks a row whose page had no
	// seller link.
	MYSQL_PRODUCT_STATUS_NO_PRODUCT int = 4
)
func (product *productStruct) main() error {
if !app.Exec.Enable.Product {
log.Warn("跳过 产品")
return nil
}
if app.Exec.Loop.Product == app.Exec.Loop.product_time {
log.Warn("已达到执行次数 产品")
return nil
}
log.Infof("------------------------")
log.Infof("2. 开始从产品页获取商家ID")
if app.Exec.Loop.Product == 0 {
log.Info("循环次数无限")
} else {
log.Infof("循环次数剩余:%d", app.Exec.Loop.Product-app.Exec.Loop.product_time)
}
app.Exec.Loop.product_time++
app.update(MYSQL_APPLICATION_STATUS_PRODUCT)
_, err := app.db.Exec("UPDATE product SET status = ? ,app = ? WHERE (status = ? or status=?) and (app=? or app=?) LIMIT 100", MYSQL_PRODUCT_STATUS_CHEKCK, app.Basic.App_id, MYSQL_PRODUCT_STATUS_INSERT, MYSQL_PRODUCT_STATUS_ERROR_OVER, 0, app.Basic.App_id)
if err != nil {
log.Errorf("更新product表失败,%v", err)
return err
}
row, err := app.db.Query(`select id,url,param from product where status=? and app = ?`, MYSQL_PRODUCT_STATUS_CHEKCK, app.Basic.App_id)
if err != nil {
log.Errorf("查询product表失败,%v", err)
return err
}
for row.Next() {
product.id = ""
var primary_id int64
var url, param string
if err := row.Scan(&primary_id, &url, ¶m); err != nil {
log.Errorf("获取product表的值失败,%v", err)
continue
}
if strings.HasPrefix(url, "http") {
continue
}
url = "https://" + app.Domain + url + param
if err := robot.IsAllow(userAgent, url); err != nil {
log.Errorf("%v", err)
continue
}
log.Infof("查找商品链接 ID:%d url:%s", primary_id, url)
err := product.request(url)
if err != nil {
if err == ERROR_NOT_SELLER_URL {
product.update_status(primary_id, MYSQL_PRODUCT_STATUS_NO_PRODUCT)
continue
} else if err == ERROR_NOT_404 || err == ERROR_NOT_503 || err == ERROR_VERIFICATION {
product.update_status(primary_id, MYSQL_PRODUCT_STATUS_ERROR_OVER)
log.Error(err)
sleep(300)
continue
} else {
product.update_status(primary_id, MYSQL_PRODUCT_STATUS_ERROR_OVER)
log.Error(err)
sleep(300)
continue
}
}
product.get_seller_id()
err = product.insert_selll_id()
if is_duplicate_entry(err) {
log.Infof("店铺已存在 商家ID:%s", product.id)
err = nil
}
if err != nil {
log.Error(err)
continue
}
if err := product.update_status(primary_id, MYSQL_PRODUCT_STATUS_OVER); err != nil {
log.Error(err)
continue
}
}
log.Infof("2. 结束从产品页获取商家ID")
log.Infof("------------------------")
return nil
}
// request fetches the product page at url with browser-like request headers
// and stores the href of the seller profile link
// (a[id=sellerProfileTriggerId]) in product.url.
//
// It returns:
//   - ERROR_VERIFICATION when the first h4 of the page reads
//     "Enter the characters you see below";
//   - ERROR_NOT_SELLER_URL when the page has no seller profile link;
//   - another non-nil error when the request cannot be built or sent, when
//     the response status is not 200, or when the body cannot be parsed.
//
// product.url is only modified when nil is returned.
func (product *productStruct) request(url string) error {
	client := get_client()
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("Authority", app.Domain)
	req.Header.Set("Accept", `text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7`)
	req.Header.Set("Accept-Language", `zh-CN,zh;q=0.9`)
	req.Header.Set("cache-control", `max-age=0`)
	req.Header.Set("device-memory", `8`)
	// NOTE(review): the trailing apostrophe in the value looks accidental;
	// kept unchanged because the server's reaction to it is not known here.
	req.Header.Set("downlink", `1.5'`)
	req.Header.Set("dpr", `2`)
	req.Header.Set("ect", `3g`)
	req.Header.Set("rtt", `350`)
	// A missing cookie is not fatal: the request is sent without one.
	if _, err := app.get_cookie(); err != nil {
		log.Error(err)
	} else {
		req.Header.Set("Cookie", app.cookie)
	}
	req.Header.Set("upgrade-insecure-requests", `1`)
	req.Header.Set("Referer", fmt.Sprintf("https://%s/?k=Hardware+electricia%%27n&crid=3CR8DCX0B3L5U&sprefix=hardware+electricia%%27n%%2Caps%%2C714&ref=nb_sb_noss", app.Domain))
	req.Header.Set("Sec-Fetch-Dest", `empty`)
	req.Header.Set("Sec-Fetch-Mode", `cors`)
	req.Header.Set("Sec-Fetch-Site", `same-origin`)
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("sec-ch-ua", `"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"`)
	req.Header.Set("sec-ch-ua-mobile", `?0`)
	req.Header.Set("sec-ch-ua-platform", `"macOS"`)

	resp, err := client.Do(req)
	if err != nil {
		log.Errorf("内部错误:%v", err)
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		log.Errorf("状态码:%d", resp.StatusCode)
		// err is nil at this point, so returning it would report the failed
		// request as a success; build a real error instead.
		return fmt.Errorf("状态码:%d", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return fmt.Errorf("内部错误:%v", err)
	}
	if doc.Find("h4").First().Text() == "Enter the characters you see below" {
		return ERROR_VERIFICATION
	}
	href, ok := doc.Find("a[id=sellerProfileTriggerId]").First().Attr("href")
	if !ok {
		return ERROR_NOT_SELLER_URL
	}
	product.url = href
	return nil
}
// get_seller_id extracts the value of the "seller" query parameter from
// product.url, stores it in product.id and returns product.id.
//
// Only the query part of the link (after the first "?", without any
// "#fragment") is examined, so the parameter is found wherever it appears in
// the query, including first. If the parameter occurs more than once the last
// occurrence wins. If it is absent, product.id is left unchanged and its
// current value is returned.
func (product *productStruct) get_seller_id() string {
	query := product.url
	if i := strings.Index(query, "?"); i >= 0 {
		query = query[i+1:]
	}
	if i := strings.Index(query, "#"); i >= 0 {
		query = query[:i]
	}
	for _, pair := range strings.Split(query, "&") {
		if strings.HasPrefix(pair, "seller=") {
			product.id = strings.TrimPrefix(pair, "seller=")
		}
	}
	return product.id
}
// insert_selll_id inserts product.id into the seller table with app_id 0 and
// returns the error reported by the INSERT, or nil when it succeeds.
func (product *productStruct) insert_selll_id() error {
	const query = "insert into seller (seller_id,app_id) values(?,?)"
	if _, err := app.db.Exec(query, product.id, 0); err != nil {
		return err
	}
	return nil
}
// update_status sets the status of the product row with primary key id to s
// and assigns the row to this application instance (app.Basic.App_id).
//
// The outcome is logged either way. On failure the database error is logged
// at error level and returned, because several callers ignore the return
// value and the log is then the only record of the failure.
func (product *productStruct) update_status(id int64, s int) error {
	_, err := app.db.Exec("UPDATE product SET status = ? ,app = ? WHERE id = ?", s, app.Basic.App_id, id)
	if err != nil {
		log.Errorf("更新product表状态失败 ID:%d app:%d 状态:%d,%v", id, app.Basic.App_id, s, err)
		return err
	}
	log.Infof("更新product表状态成功 ID:%d 状态:%d app:%d ", id, s, app.Basic.App_id)
	return nil
}