Expanding English Keywords for Cross-Border E-commerce
This is a no-frills post; it is the same old tricks from start to finish. In SEO, adding keywords very often means adding traffic. Without further ado, below are scripts for harvesting keywords from several e-commerce platforms, such as Alibaba.com (the international site), Amazon, eBay, and DHgate, for learning and reference only.
Scraping Alibaba.com search-dropdown keywords
#encoding=utf-8
import requests,re,random,time,multiprocessing

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

def gethtml(url, headers):
    # fetch the URL through a random proxy, retrying until a usable 200 response comes back
    while 1:
        try:
            newip = ip()
            proxies = {"http": "http://%s" % newip.strip()}
            pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'proxy failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry with another proxy
            continue

def getKeyword(url):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'connectkeyword.alibaba.com',
        # 'Referer': 'http://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=robot+vacuum+cleaner',
        'User-Agent': '%s' % getUA(),
    }
    html = gethtml(url, headers)
    re_data = re.compile(r"keywords:'(.*?)'")
    re_data = re.findall(re_data, html)
    for word in re_data:
        print word

urls = []
with open('word.txt') as f:
    for i in f.readlines():
        url = 'http://connectkeyword.alibaba.com/lenoIframeJson.htm?keyword=%s' % i.strip()
        urls.append(url)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        pool.apply_async(getKeyword, (url, ))
    pool.close()
    pool.join()
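The script above targets Python 2. As a rough sketch only, the same harvest-the-dropdown loop in Python 3 might look like the following (proxy and User-Agent rotation are omitted, and it assumes the connectkeyword.alibaba.com endpoint and the keywords:'...' response fragments the regex above relies on are unchanged):

# Minimal Python 3 sketch of the same idea; endpoint and response format assumed unchanged.
import re
import requests

def alibaba_suggest(seed):
    url = 'http://connectkeyword.alibaba.com/lenoIframeJson.htm?keyword=%s' % seed.strip()
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    # suggestions are embedded as keywords:'...' fragments, exactly what the regex above captures
    return re.findall(r"keywords:'(.*?)'", resp.text)

if __name__ == '__main__':
    with open('word.txt') as f:
        for line in f:
            for word in alibaba_suggest(line):
                print(word)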
Scraping Alibaba.com related keywords
#encoding=utf-8
import requests,re,time,random,threading

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.alibaba.com',
    'Referer': 'http://www.alibaba.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': '%s' % getUA(),
}

def gethtml(url):
    # fetch the URL through a random proxy, retrying until a usable 200 response comes back
    while 1:
        try:
            newip = ip()
            proxies = {"http": "http://%s" % newip.strip()}
            pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            # pages = requests.get(url, headers=headers, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'proxy failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry with another proxy
            continue

class alibaba(threading.Thread):
    def __init__(self, target):
        super(alibaba, self).__init__()
        self.target = target
    def run(self):
        self.get_data()
    def get_data(self):
        # each thread works through its own slice of URLs
        for url in self.target:
            html = gethtml(url)
            re_data = re.compile(r"data-domdot='.*?'>(.*?)</a>")
            re_data = re.findall(re_data, html)
            for word in re_data:
                print word

target = []
with open('word.txt') as f:
    for word in f.readlines():
        target.append('https://www.alibaba.com/products/%s.html' % word.strip())

if __name__ == '__main__':
    start_working = []
    threads = 10
    chunk = (len(target) + threads - 1) // threads   # ceiling division: URLs per thread
    for i in range(threads):
        get_target = alibaba(target[chunk * i:chunk * (i + 1)])
        start_working.append(get_target)
    for i in range(len(start_working)):
        start_working[i].start()
    for i in range(len(start_working)):
        start_working[i].join()
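The slicing arithmetic in the __main__ block splits the URL list into equally sized chunks by ceiling division, one chunk per thread. The same idea in isolation, as a hypothetical helper that is not part of the script above:

# Hypothetical helper illustrating the ceiling-division chunking used above.
def chunk_list(items, n):
    size = (len(items) + n - 1) // n          # ceiling division: items per chunk
    return [items[size * i:size * (i + 1)] for i in range(n)]

# chunk_list(list(range(10)), 3) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]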
Scraping competitors' keywords on Alibaba.com
#encoding=utf-8
import requests,re,time,random,pycurl,StringIO,multiprocessing

# CSV file that keyword rows are appended to
op_csv = open('keyword.csv', 'a+')

# request headers, copied verbatim from the browser's devtools (hence the authority/method/path/scheme pseudo fields)
headers = [
    'authority:bairun.en.alibaba.com',
    'method:GET',
    'path:/productlist-2.html?isGallery=Y',
    'scheme:https',
    'accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding:gzip, deflate, sdch, br',
    'accept-language:zh-CN,zh;q=0.8',
    # 'cookie:t=de2c15aaa52943b32f52047e5a99ca38; cna=GwWwEP55+xoCAXQVUP45eRCN; ali_apache_id=116.21.80.254.1479107002992.269078.3; xman_us_f=x_l=1; ali_beacon_id=116.21.80.254.1479107004102.167056.7; _ga=GA1.2.625111664.1480316413; _umdata=ED82BDCEC1AA6EB98C548550E8C44E0AE5F011C4983AF5532DF3387C1347EB9563BE680EE5DD1E6845CFE81BBEE63D88186BA677F30EB7AD6FF80CCC2BD4A3C798A6569850CD68BEB0973249799147735C5F1F0CC270D560F3C2D4966392052A; xman_f=vfBloadSAJR5D11GhbA21T8ukbUCuLph798C5cGdkR4CM6nJE5+SDIspmSWNAJjVpORPK888ZngYPtT+9acvoe9HNCphoIX0KhmARYzojiePaJr6eHAz7g==; history=product_selloffer%5E%0A60585412097%24%0A1821282900%24%0A60455100674; gangesweb-buckettest=116.21.80.254.1479107019387.1; ali_ab=116.21.80.254.1479107004134.0; ali_apache_track="mid=fastcleaner"; l=As/PHJMu9yY8YvH2zsyDpZ5032nZCiMW; isg=AtLSib8p3iaTwyK42wcgudK6I5jnytZ9bqvVYZwrAAVwr3KphHMmjdiPabxp
    'upgrade-insecure-requests:1',
    'user-agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
]

# proxy support
daili_list = []
def ip():
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

# extract the first capture group with re.search; return 'no' when nothing matches
def search(re_url, html):
    re_Data = re.search(re_url, html)
    if re_Data:
        return re_Data.group(1)
    else:
        return 'no'

# fetch the page source with pycurl
def gethtml(url):
    while 1:
        try:
            # newip = ip()  # proxy
            c = pycurl.Curl()                          # build a curl object
            c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects automatically
            c.setopt(pycurl.MAXREDIRS, 5)              # follow at most 5 redirects
            c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connection timeout
            c.setopt(pycurl.TIMEOUT, 120)              # download timeout
            c.setopt(pycurl.ENCODING, 'gzip,deflate')  # some sites return gzip even when the request did not ask for it
            # c.setopt(c.PROXY, newip)  # proxy
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)                  # URL to fetch
            c.setopt(pycurl.HTTPHEADER, headers)       # pass the request headers
            # c.setopt(pycurl.POST, 1)
            # c.setopt(pycurl.POSTFIELDS, data)        # POST data
            c.setopt(c.WRITEFUNCTION, c.fp.write)      # write the body into the string buffer
            # c.setopt(pycurl.SSL_VERIFYPEER, 0)
            # c.setopt(pycurl.SSL_VERIFYHOST, 0)
            # c.setopt(pycurl.HEADERFUNCTION, headerCookie)
            # c.setopt(pycurl.COOKIE, Cookie)
            c.perform()
            code = c.getinfo(c.HTTP_CODE)              # status code
            html = c.fp.getvalue()                     # page source
            if '302 Found' in html or code != 200:
                print 'request failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow errors and retry
            continue

# main worker: collect the title and keyword of every product on one list page
def getKeyword(url):
    html = gethtml(url)
    # product URLs on the current list page
    re_url = re.compile(r'<a href="(/product/.*?html)" target="_blank" title=".*?" data-domdot=".*?,mn:.*?,ext:.*?">[\s\S]*?</a>')
    re_url = re.findall(re_url, html)
    # walk every product URL
    for i in re_url:
        # build the absolute product URL
        url = 'http://olinecentury.en.alibaba.com' + i.strip()
        html = gethtml(url)
        # extract the title
        re_title = re.compile(r'<span class="title-text">(.*?)</span>')
        title = search(re_title, html).strip()
        # extract the keyword
        re_word = re.compile(r'<li><a href="http://www\.alibaba\.com/showroom/.*?html" class="qrPRskw">(.*?)</a></li>')
        keyword = search(re_word, html).strip()
        # print and save
        print 'title: ' + title, 'keyword: ' + keyword
        op_csv.write('%s,%s\n' % (title, keyword))

# build the list-page URLs and collect them in urls
urls = []
for i in range(1, 6):
    url = 'http://olinecentury.en.alibaba.com/productgrouplist-803875670-%d/vacuum_cleaner.html?isGallery=Y' % i
    urls.append(url)

# start the process pool
if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:                            # walk the list-page URLs
        pool.apply_async(getKeyword, (url, ))   # hand each one to getKeyword(url)
    pool.close()
    pool.join()
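One caveat about the CSV output above: product titles often contain commas themselves, which breaks a hand-rolled '%s,%s\n' row. A small hypothetical alternative using the standard csv module, which handles quoting automatically:

# Hypothetical variant of the CSV writing above, using the csv module so that
# commas inside titles or keywords are quoted correctly.
import csv

op_csv = open('keyword.csv', 'ab')     # 'b' mode as recommended for the csv module on Python 2
writer = csv.writer(op_csv)

def save_row(title, keyword):
    writer.writerow([title, keyword])  # quoting and the trailing newline are handled for you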
Scraping Amazon search-dropdown keywords
#encoding=utf-8
import requests,re,random,time,multiprocessing,urllib

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

def gethtml(url, headers):
    # fetch the URL, retrying until a usable 200 response comes back
    while 1:
        try:
            # newip = ip()
            # proxies = {"http": "http://%s" % newip.strip()}
            # pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            pages = requests.get(url, headers=headers, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'request failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry
            continue

# extract the first capture group with re.search; return 'no' when nothing matches
def search(re_url, html):
    re_Data = re.search(re_url, html)
    if re_Data:
        return re_Data.group(1)
    else:
        return 'no'

def getKeyword(url):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'completion.amazon.com',
        # 'Referer': 'http://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=robot+vacuum+cleaner',
        'User-Agent': '%s' % getUA(),
    }
    html = gethtml(url, headers)
    # print html
    re_data = re.compile(r'completion = (.*?)"nodes"')
    re_data = search(re_data, html)
    if re_data != 'no':
        re_word = re.compile(r'"(.*?)"')
        re_data = re.findall(re_word, re_data)
        for word in re_data:
            print word

urls = []
with open('word.txt') as f:
    for i in f.readlines():
        url = 'https://completion.amazon.com/search/complete?mkt=1&p=Search&x=String&search-alias=aps&q=%s' % i.strip()
        urls.append(url)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        pool.apply_async(getKeyword, (url, ))
    pool.close()
    pool.join()
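The completion.amazon.com response that the regexes above pick apart is essentially a JSONP-style wrapper around a JSON array. As a rough sketch, and assuming the response format has not changed since this was written, it can also be parsed with the json module:

# Rough sketch: parse the completion response as JSON instead of with raw regexes.
# Assumes the endpoint still answers with a "completion = [...]" style wrapper.
import json
import re
import requests

def amazon_suggest(seed, ua='Mozilla/5.0'):
    url = ('https://completion.amazon.com/search/complete'
           '?mkt=1&p=Search&x=String&search-alias=aps&q=%s' % seed.strip())
    text = requests.get(url, headers={'User-Agent': ua}, timeout=10).text
    m = re.search(r'completion\s*=\s*(\[.*\])', text, re.S)   # peel off the wrapper
    if not m:
        return []
    data = json.loads(m.group(1))
    # the second element of the array has usually been the list of suggestion strings
    return data[1] if len(data) > 1 and isinstance(data[1], list) else []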
Scraping eBay related search terms
#encoding=utf-8
import requests,re,random,time,multiprocessing,urllib

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

def gethtml(url, headers):
    # fetch the URL, retrying until a usable 200 response comes back
    while 1:
        try:
            # newip = ip()
            # proxies = {"http": "http://%s" % newip.strip()}
            # pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            pages = requests.get(url, headers=headers, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'request failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry
            continue

# extract the first capture group with re.search; return 'no' when nothing matches
def search(re_url, html):
    re_Data = re.search(re_url, html)
    if re_Data:
        return re_Data.group(1)
    else:
        return 'no'

def getKeyword(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.ebay.com',
        'Upgrade-Insecure-Requests': '1',
        # 'Referer': 'http://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=robot+vacuum+cleaner',
        'User-Agent': '%s' % getUA(),
    }
    html = gethtml(url, headers)
    # print html
    re_data = re.compile(r'data-text="(.*?)"')
    re_data = re.findall(re_data, html)
    if re_data:
        for word in re_data:
            print word

urls = []
with open('word.txt') as f:
    for i in f.readlines():
        url = 'http://www.ebay.com/sch/i.html?_nkw=%s' % i.strip()
        urls.append(url)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        pool.apply_async(getKeyword, (url, ))
    pool.close()
    pool.join()
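A side note that applies to all of these scripts: urllib is imported but never used, and the seed word is interpolated straight into the query string. If word.txt contains multi-word seeds such as robot vacuum cleaner, it is safer to URL-encode them first, for example:

# URL-encode seed keywords before building the search URL (urllib.quote_plus on Python 2).
import urllib

with open('word.txt') as f:
    for i in f.readlines():
        url = 'http://www.ebay.com/sch/i.html?_nkw=%s' % urllib.quote_plus(i.strip())
        print url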
Scraping DHgate search-dropdown keywords
#encoding=utf-8
import requests,re,random,time,multiprocessing,urllib

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

def gethtml(url, headers):
    # fetch the URL, retrying until a usable 200 response comes back
    while 1:
        try:
            # newip = ip()
            # proxies = {"http": "http://%s" % newip.strip()}
            # pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            pages = requests.get(url, headers=headers, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'request failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry
            continue

# extract the first capture group with re.search; return 'no' when nothing matches
def search(re_url, html):
    re_Data = re.search(re_url, html)
    if re_Data:
        return re_Data.group(1)
    else:
        return 'no'

def getKeyword(url):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.dhgate.com',
        # 'Referer': 'http://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=robot+vacuum+cleaner',
        'User-Agent': '%s' % getUA(),
        'X-Requested-With': 'XMLHttpRequest',
    }
    html = gethtml(url, headers)
    # print html
    re_data = re.compile(r'<strong>(.*?)\|')
    re_data = re.findall(re_data, html)
    if re_data:
        for word in re_data:
            re_mark = re.compile(r"</strong>")
            word = re.sub(re_mark, '', word)   # strip the stray closing tag from the suggestion
            print word

urls = []
with open('word.txt') as f:
    for i in f.readlines():
        url = 'http://www.dhgate.com/wholesale/searchTools.do?act=suggestKeywords&q=%s' % i.strip()
        urls.append(url)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        pool.apply_async(getKeyword, (url, ))
    pool.close()
    pool.join()
Scraping DHgate related search terms
#encoding=utf-8
import requests,re,random,time,multiprocessing,urllib

daili_list = []

def ip():
    # read the proxy list and pick one proxy at random
    for x in open(r'E:\BaiduNetdiskDownload\daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

def getUA():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    ]
    newUa = random.choice(uaList)
    return newUa

def gethtml(url, headers):
    # fetch the URL, retrying until a usable 200 response comes back
    while 1:
        try:
            # newip = ip()
            # proxies = {"http": "http://%s" % newip.strip()}
            # pages = requests.post(url, headers=headers, proxies=proxies, timeout=10)
            pages = requests.get(url, headers=headers, timeout=10)
            html = pages.content
            code = pages.status_code
            if '302 Found' in html or code != 200:
                print 'request failed, retrying', url
                continue
            else:
                return html
        except Exception:
            # swallow network errors and retry
            continue

# extract the first capture group with re.search; return 'no' when nothing matches
def search(re_url, html):
    re_Data = re.search(re_url, html)
    if re_Data:
        return re_Data.group(1)
    else:
        return 'no'

def getKeyword(url):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.dhgate.com',
        # 'Referer': 'http://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=robot+vacuum+cleaner',
        'User-Agent': '%s' % getUA(),
        'X-Requested-With': 'XMLHttpRequest',
    }
    html = gethtml(url, headers)
    # print html
    re_data = re.compile(r'<div class="relatesear-wrap">([\s\S]*?)</div>')
    re_data = search(re_data, html)
    if re_data != 'no':
        re_word = re.compile(r'<a href="http://www.dhgate.com/w/.*?">(.*?)</a>')
        re_data = re.findall(re_word, re_data)
        for word in re_data:
            re_mark = re.compile(r",")
            word = re.sub(re_mark, '', word)   # drop commas inside the keyword
            print word

urls = []
with open('word.txt') as f:
    for i in f.readlines():
        url = 'http://www.dhgate.com/wholesale/search.do?searchkey=%s' % i.strip()
        urls.append(url)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for url in urls:
        pool.apply_async(getKeyword, (url, ))
    pool.close()
    pool.join()
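Once keywords have been harvested from several platforms, the lists inevitably overlap. An optional post-processing sketch for merging and de-duplicating them (the file names here are placeholders, not outputs of the scripts above):

# Optional post-processing: merge harvested keyword files and drop duplicates.
seen = set()
with open('all_keywords.txt', 'w') as out:
    for fname in ['alibaba.txt', 'amazon.txt', 'ebay.txt', 'dhgate.txt']:
        for line in open(fname):
            word = line.strip().lower()
            if word and word not in seen:
                seen.add(word)
                out.write(word + '\n')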
More scripts may be added later. Finally, an easter egg for the readers who actually scrolled all the way down: the Zen of Python! Type import this in the interactive interpreter.
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
# Beautiful beats ugly (Python aims for beautifully written code)
Explicit is better than implicit.
# Explicit beats implicit (good code is clear: sensible names, consistent style)
Simple is better than complex.
# Simple beats complex (good code is simple, with no convoluted internals)
Complex is better than complicated.
# Complex beats complicated (if complexity is unavoidable, keep the relationships understandable and the interfaces clean)
Flat is better than nested.
# Flat beats nested (good code avoids deep nesting)
Sparse is better than dense.
# Sparse beats dense (good code leaves breathing room; don't try to solve everything on one line)
Readability counts.
# Readability matters (good code is readable)
Special cases aren't special enough to break the rules.
Although practicality beats purity.
# Don't break these rules in the name of a "practical" special case (the rules come first)
Errors should never pass silently.
Unless explicitly silenced.
# Don't swallow every error unless you deliberately choose to (catch exceptions precisely; no except: pass style code)
In the face of ambiguity, refuse the temptation to guess.
# When several interpretations are possible, resist the urge to guess
There should be one-- and preferably only one --obvious way to do it.
# Instead look for one, preferably exactly one, obvious solution (if unsure, enumerate the options)
Although that way may not be obvious at first unless you're Dutch.
# That isn't always easy, since you aren't the father of Python ("Dutch" here refers to Guido)
Now is better than never.
Although never is often better than *right* now.
# Doing it may beat never doing it, but rushing in without thinking is worse than not doing it (think before you code)
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
# If you cannot explain your approach to someone, it is probably not a good approach, and vice versa (a test for any design)
Namespaces are one honking great idea -- let's do more of those!
# Namespaces are a wonderful idea; let's use more of them