使用 XPath 实现简单的数据爬取——爬取购物网站
利用 XPath 抓取购物网站的商品数据,将商品图片保存到本地,并将抓取到的数据保存到 MongoDB 数据库。
import requests
from lxml import etree
import pymongo
import os
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without it a stalled server would block
            the script indefinitely.

    Returns:
        The response body as ``str`` (``Response.text``).

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # The site is queried with a browser-like User-Agent and a Referer that
    # points at the listing page itself.
    headers = {
        'Referer': 'http://www.meilishuo.com/search/catalog/10057050?page=2',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    return resp.text
def get_img(img, timeout=10):
    """Download an image and return its raw bytes.

    Args:
        img: URL of the image to fetch.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without it one stalled image would block
            the whole crawl.

    Returns:
        The response body as ``bytes`` (``Response.content``).

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Same browser-like headers that are used for the listing page request.
    headers = {
        'Referer': 'http://www.meilishuo.com/search/catalog/10057050?page=2',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    resp_img = requests.get(img, headers=headers, timeout=timeout)
    return resp_img.content
def parse_html(html):
    """Parse a listing page, store every product and collect its image.

    For each ``<li>`` whose class contains ``product-list`` the image is
    downloaded once, the product record is written to MongoDB through
    ``sava_mongo`` and the image bytes are appended to the returned list.

    Args:
        html: Markup of the listing page.

    Returns:
        A list with the raw bytes of every downloaded product image, in
        page order.
    """
    page = etree.HTML(html)
    product_list = page.xpath('//li[contains(@class,"product-list")]')

    img_list = []
    for product in product_list:
        # Image URL is read from the ``data-src`` attribute of the link.
        img_urls = product.xpath('./div[@class="img-size data-ptp-item"]/a/@data-src')
        if not img_urls:
            # No image URL on this item: indexing [0] would raise IndexError
            # and abort the whole page, so skip just this product instead.
            continue

        # Download once and reuse the bytes for both the local copy and the
        # database record (previously the image was fetched twice).
        image_bytes = get_img(img_urls[0])
        img_list.append(image_bytes)

        # xpath() returns lists of text nodes; they are stored as-is.
        price = product.xpath('./div[@class="product-info clearfix"]/div/span[@class="price-n"]/text()')
        collection = product.xpath('./div[@class="product-info clearfix"]/div[@class="fav fr"]/span/text()')
        desc = product.xpath('./a[@class="text-link"]/span/text()')

        data = {
            # The image is stored in the database as binary data.
            'img': image_bytes,
            'price': price,
            'collection': collection,
            'desc': desc,
        }
        sava_mongo(data)
    return img_list
def sava_mongo(data):
    """Insert one product document into the local MongoDB instance.

    The document goes into collection ``data`` of database ``mlsdata`` on
    ``localhost:27017``. Failures are reported on stdout and not re-raised,
    so one bad record does not stop the crawl.

    Args:
        data: Document (dict) to insert.
    """
    try:
        # ``with`` closes the client when done; previously every call left
        # a new connection pool open.
        with pymongo.MongoClient('localhost', port=27017) as client:
            # insert_one replaces Collection.insert, which was deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            client['mlsdata']['data'].insert_one(data)
    except Exception as exc:
        # Best-effort boundary: keep the original message but show the cause,
        # since not every failure here is a connection problem.
        print('数据库链接失败', exc)
    else:
        print('数据写入成功')
def main():
    """Crawl one listing page and write its product images to disk.

    The page is fetched with ``get_html`` and parsed with ``parse_html``
    (which also stores the records in MongoDB). The returned image bytes are
    written to ``movie_photo/mls<N>.jpg``, numbered from 1.
    """
    url = 'http://www.meilishuo.com/search/catalog/10057050?page=2'
    page = get_html(url)
    images = parse_html(page)

    # Default output folder, created inside the current working directory.
    # exist_ok avoids FileExistsError when the script is run a second time.
    folder = 'movie_photo'
    os.makedirs(folder, exist_ok=True)

    # Build explicit paths instead of os.chdir(), so the process-wide working
    # directory is left untouched.
    for index, image_bytes in enumerate(images, start=1):
        target = os.path.join(folder, 'mls' + str(index) + '.jpg')
        with open(target, 'wb') as f:
            f.write(image_bytes)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()