仅供学习使用,请勿无限制地使用爬虫
功能描述
目标:获取淘宝搜索页面的信息,提取其中的商品名称和价格
程序的结构设计
步骤1:提交商品搜索请求,循环获取页面
步骤2:对每个页面,提取商品名称和价格信息
步骤3:将信息输出到屏幕上
修改headers的方法请看:https://blog.csdn.net/wyatt007/article/details/105433009
import requests
import re
def getHTMLText(url):
    """Fetch ``url`` with browser-like headers and return the response text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with the encoding detected from its
        content, or an empty string when the request fails (connection
        error, timeout after 30 seconds, or a non-success HTTP status).
    """
    # Taobao applies anti-scraping checks, so the request carries the headers
    # and cookie captured from a browser session to look like a real user.
    # NOTE(review): the original note here said the cookie and referer had
    # been redacted, but the values below are literal session data. They are
    # credentials hard-coded in source; consider loading them from
    # configuration instead. They presumably expire -- confirm before relying
    # on them.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'referer': 'https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fs.taobao.com%2Fsearch%3Fq%3D%25E4%25B9%25A6%25E5%258C%2585%26imgfile%3D%26commend%3Dall%26ssid%3Ds5-e%26search_type%3Ditem%26sourceId%3Dtb.index%26spm%3Da21bo.2017.201856-taobao-item.1%26ie%3Dutf8%26initiative_id%3Dtbindexz_20170306&uuid=a76051882a49611851049b58f3ba0c4a',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '_m_h5_tk=dd76fc3e6c033f4a8d8021db339a6310_1586513908224; _m_h5_tk_enc=96606e0cb355b1af3effa79bcaac6ab5; cna=NBQXF/YlUBICAW8OWvCit+VK; cookie2=18c858a70d4fce1b8c89af2efdf96783; t=dc365f48c9959c4d7e3f953ee22311ba; _tb_token_=56ae8975fb5e7; thw=cn; v=0; _samesite_flag_=true; unb=1865322938; lgc=wyatt007forest; cookie17=UondEQYxBUF5vg%3D%3D; dnk=wyatt007forest; tracknick=wyatt007forest; _l_g_=Ug%3D%3D; sg=t85; _nk_=wyatt007forest; cookie1=B0T8cGrO6mvYMvFGJtImohCg6Pp0kk6SAFOFxlzrB4k%3D; enc=Y9dHDAdu3egag5jNZIsGKVclI615Dbgl%2Ff9j6QJieRxoWyGkHfMywXdwIhiMHCLE6zqpYt7GeCafncNM6NDijg%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_1; alitrackid=login.taobao.com; lastalitrackid=login.taobao.com; sgcookie=EG6Tw2r8ZuQGCzQuamtYy; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie21=UIHiLt3xSifiVqTH8o%2F0Qw%3D%3D&cookie14=UoTUPOFPQuPCNw%3D%3D&existShop=false&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&pas=0; uc3=vt3=F8dBxdGNasGuibD9DuA%3D&id2=UondEQYxBUF5vg%3D%3D&lg2=W5iHLLyFOGW7aA%3D%3D&nk2=FOBNgykgFniOzYa4Ulc%3D; csg=e04469e6; skt=1a8a9900f7e38fca; existShop=MTU4NjUwODUxNA%3D%3D; uc4=id4=0%40UOE3H2P%2BBP8F56sMhO3oW%2B4xeCEg&nk4=0%40FmuWKo5sKfc1BRnX8Q6OBjpgOM5hr9N%2B1g%3D%3D; _cc_=V32FPkk%2Fhw%3D%3D; JSESSIONID=5D83ECCA1EAA48986DFC186A278B74FA; tfstk=cIFABADbhgjm_fNoHSBo1cFEE_bha88xjEi96NiuyXJP-eODgs4atWeIKtgwDbQR.; isg=BAwM29cXvs9QPKqzMIgB2PVs3Wo-RbDvFkwmlGbNGLda8az7jlWAfwJDlflJouhH; l=dBTowWyqQoajurF9BOCanurza77OSIRYYuPzaNbMi_5ZZ6T6_9_OotI4XF96VjWftM8B4dapP5p9-etkZQDmndK-g3fPaxDc.',
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        # Turn 4xx/5xx responses into an exception so they take the same
        # failure path as network errors.
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are treated as "no page"; programming errors
        # and KeyboardInterrupt are no longer swallowed.
        return ""
def parsePage(ilt, html):
    """Extract price/title pairs from a search page and append them to ``ilt``.

    The page text is scanned for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fragments. The n-th price is paired with the n-th
    title; if the two lists differ in length the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product.
            It is modified in place; both values are strings.
        html: Page text to scan. An empty value adds nothing.

    Returns:
        None.
    """
    # Function-scope import so this block needs no file-level import change.
    import json

    if not html:
        return

    # Capture only the text between the quotes instead of splitting the whole
    # match on ':' -- titles may themselves contain a colon.
    prices = re.findall(r'\"view_price\"\:\"([\d\.]*)\"', html)
    titles = re.findall(r'\"raw_title\"\:\"(.*?)\"', html)

    for price, raw_title in zip(prices, titles):
        # The title is scraped, untrusted text: decode escapes such as \uXXXX
        # with the JSON string parser rather than eval().
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body (e.g. it ends in a backslash);
            # keep the text exactly as it appeared in the page.
            title = raw_title
        ilt.append([price, title])
def printfGoodsList(ilt):
    """Print the collected goods as a tab-separated table on standard output.

    A header row is printed first, followed by one row per entry, numbered
    from 1.

    Args:
        ilt: Sequence of entries where index 0 is the price and index 1 is
            the product title.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    # enumerate supplies the 1-based row number directly.
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))
def main(goods="华为手机", depth=3):
    """Search Taobao for ``goods`` and print the prices and titles found.

    Args:
        goods: Search keyword. It is percent-encoded before being placed in
            the query string, so the printed URL shows the encoded form.
        depth: Number of result pages to request.
    """
    # Function-scope import so this block needs no file-level import change.
    from urllib.parse import quote

    start_url = "https://s.taobao.com/search?q=" + quote(goods, safe="")
    inforList = []
    for page in range(depth):
        # The `s` query parameter is set to 44 * page index; 44 is presumably
        # the number of results per page -- TODO confirm.
        url = start_url + '&s=' + str(44 * page)
        print(url)
        html = getHTMLText(url)
        if not html:
            # getHTMLText returns "" when the request failed; skip this page
            # and keep going with the remaining ones.
            continue
        parsePage(inforList, html)
    printfGoodsList(inforList)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

7152

被折叠的 条评论
为什么被折叠?



