# Scrape product data from a shopping site with XPath, save the product images to local disk, and store the scraped data in a MongoDB database.

import requests
from lxml import etree
import pymongo
import os


def get_html(url, timeout=10):
	"""Fetch a page and return its body as text.

	Sends a GET request with a fixed Referer and a desktop browser
	User-Agent header.

	Args:
		url: Address of the page to download.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout, requests can block indefinitely on a stalled server.

	Returns:
		The response body decoded to ``str`` (``Response.text``).

	Raises:
		requests.exceptions.RequestException: On connection errors or when
			the timeout expires.
	"""
	header = {
		'Referer': 'http://www.meilishuo.com/search/catalog/10057050?page=2',
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
	}
	resp = requests.get(url, headers=header, timeout=timeout)
	return resp.text

def get_img(img, timeout=10):
	"""Download an image and return its raw bytes.

	Sends a GET request with a fixed Referer and a desktop browser
	User-Agent header.

	Args:
		img: URL of the image to download.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout, requests can block indefinitely on a stalled server.

	Returns:
		The undecoded response body as ``bytes`` (``Response.content``).

	Raises:
		requests.exceptions.RequestException: On connection errors or when
			the timeout expires.
	"""
	header = {
		'Referer': 'http://www.meilishuo.com/search/catalog/10057050?page=2',
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
	}
	resp_img = requests.get(img, headers=header, timeout=timeout)
	return resp_img.content

def parse_html(html):
	"""Extract every product from a listing page, store it, and collect images.

	For each ``<li>`` whose class contains ``product-list`` the image is
	downloaded once, the price / favourite count / description text nodes
	are read, and the assembled record is passed to ``sava_mongo``.

	Args:
		html: Markup of the listing page.

	Returns:
		A list with the raw bytes of each downloaded product image, in page
		order. Products without an image URL are skipped.
	"""
	page = etree.HTML(html)
	product_list = page.xpath('//li[contains(@class,"product-list")]')
	img_list = []
	for product in product_list:
		# Image URL; an item without one is skipped rather than aborting the
		# whole page with an IndexError.
		img_urls = product.xpath('./div[@class="img-size data-ptp-item"]/a/@data-src')
		if not img_urls:
			continue
		# Download once and reuse the bytes for both the local copy and the
		# database record.
		img_bytes = get_img(img_urls[0])
		img_list.append(img_bytes)

		# Price
		price = product.xpath('./div[@class="product-info clearfix"]/div/span[@class="price-n"]/text()')
		# Favourite count
		collection = product.xpath('./div[@class="product-info clearfix"]/div[@class="fav fr"]/span/text()')
		# Description
		desc = product.xpath('./a[@class="text-link"]/span/text()')

		# Assemble the record; the image is stored in the database as binary.
		data = {
			'img': img_bytes,
			'price': price,
			'collection': collection,
			'desc': desc
		}
		sava_mongo(data)
	return img_list



def sava_mongo(data):
	"""Insert one record into the ``data`` collection of database ``mlsdata``.

	Connects to MongoDB on ``localhost:27017``, writes the document and
	closes the connection again. Failures are reported on stdout instead of
	being raised, so one bad record does not stop the caller.

	Args:
		data: Document (dict) to insert.
	"""
	try:
		# The context manager closes the client when the block exits, so a
		# connection is not leaked on every call.
		with pymongo.MongoClient('localhost', port=27017) as client:
			# insert_one replaces the deprecated Collection.insert, which
			# was removed in PyMongo 4.
			result = client['mlsdata']['data'].insert_one(data)
		if result.acknowledged:
			print('数据写入成功')
	except Exception as exc:
		# Deliberately best-effort, but show the real cause next to the
		# original message.
		print('数据库链接失败', exc)


def main():
	"""Scrape one listing page and write its product images to disk.

	Downloads the page, lets ``parse_html`` store every product and return
	the image bytes, then writes the images as ``mls1.jpg``, ``mls2.jpg``,
	... into the ``movie_photo`` folder under the current working directory.
	"""
	url = 'http://www.meilishuo.com/search/catalog/10057050?page=2'
	page = get_html(url)
	images = parse_html(page)

	# Default output folder. exist_ok lets the script be run again without
	# failing because the folder already exists.
	folder = 'movie_photo'
	os.makedirs(folder, exist_ok=True)

	# Build explicit paths instead of changing the working directory, so a
	# repeated call does not create nested folders.
	for index, content in enumerate(images, start=1):
		path = os.path.join(folder, 'mls' + str(index) + '.jpg')
		with open(path, 'wb') as f:
			f.write(content)



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()

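# --- Blog page footer captured with the script; not part of the program ---
# Tip the author
#
# Leave a comment
#
# Your email address will not be published. Required fields are marked with *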