# NOTE: This post was written 389 days ago and last modified 389 days ago; some information may be outdated.
# -*- coding: utf-8 -*-
"""
@File : MeiZiTu.py
@Time : 2019/12/25 22:12
@Author : YiFang
@Email : [email protected]
@Software: PyCharm
"""
import os
import time
import requests
from lxml import etree
# Root save directory for downloaded galleries. The trailing separator is
# intentional: other code builds paths via plain string concatenation.
PICTURS_PATH = os.path.join(os.getcwd(), 'Picturs/')

# Default request headers: a desktop Chrome User-Agent plus the Referer the
# site requires before it will serve its listing pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'http://www.mzitu.com',
}
class Spider(object):
    """Crawler for mzitu.com: walks listing pages, collects gallery URLs,
    then downloads every image of every gallery into PICTURS_PATH.

    page_num: total number of listing pages to crawl (int or numeric str).
    """

    def __init__(self, page_num):
        # Total number of listing pages to crawl.
        self.page_num = page_num
        # Listing-page URLs; page 1 has no "/page/N" suffix.
        self.page_urls = ['https://www.mzitu.com']
        # Gallery (album) URLs collected from the listing pages.
        self.girl_urls = []
        # Title of the gallery currently being processed (used as folder name).
        self.girl_name = ''
        # Image URLs of the gallery currently being processed.
        self.pic_urls = []

    def get_page_urls(self):
        """Append listing-page URLs for pages 2..page_num.

        For page_num <= 1 the range is empty, matching the original
        "page 1 only" behavior.
        """
        for n in range(2, int(self.page_num) + 1):
            self.page_urls.append('https://www.mzitu.com/page/' + str(n))

    def get_girl_urls(self):
        """Fetch each listing page and collect every gallery URL on it."""
        for page_url in self.page_urls:
            html = requests.get(page_url, headers=headers).content
            selector = etree.HTML(html)
            self.girl_urls += selector.xpath('//ul[@id="pins"]/li/span/a/@href')

    def get_pic_urls(self):
        """For each gallery: derive all image URLs, then download them.

        A failure in one gallery is reported and skipped so the crawl
        continues with the remaining galleries.
        """
        for girl_url in self.girl_urls:
            html = requests.get(girl_url, headers=headers).content
            selector = etree.HTML(html)
            # Gallery title shown on the page; becomes the folder name.
            self.girl_name = selector.xpath('//h2[@class="main-title"]/text()')[0]
            # Fifth span in the pagination bar holds the last page number.
            max_num = int(selector.xpath('//div[@class="pagenavi"]/a/span/text()')[4])
            # The first image is ".../<prefix>01.jpg"; all images share the
            # prefix, numbered with two-digit zero padding.
            img_prefix = str(selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]).split("01.")[0]
            self.pic_urls = [img_prefix + str(num).zfill(2) + ".jpg"
                             for num in range(1, max_num + 1)]
            try:
                self.download_pic(girl_url)
            except Exception as e:
                # Best effort: report the failure (including the cause) and
                # move on instead of aborting the whole crawl.
                print("保存失败:" + self.girl_name + " ({})".format(e))

    def download_pic(self, this_referer):
        """Download every URL in pic_urls into a per-gallery folder.

        this_referer: the gallery page URL; the image host rejects requests
        without a matching Referer header.
        """
        girl_path = os.path.join(PICTURS_PATH, self.girl_name)
        if os.path.isdir(girl_path):
            print("[{}] - 文件夹已存在".format(self.girl_name))
        # Creates PICTURS_PATH and the gallery folder in one call; existing
        # directories are not an error (replaces the old bare try/except).
        os.makedirs(girl_path, exist_ok=True)
        # Per-gallery headers: reuse the shared User-Agent, override Referer.
        this_headers = {
            'User-Agent': headers['User-Agent'],
            'Referer': this_referer,
        }
        for img_name, pic_url in enumerate(self.pic_urls, start=1):
            pic_path = os.path.join(girl_path, str(img_name) + ".jpg")
            # Skip images already on disk BEFORE fetching them (the original
            # downloaded the image even when it was then thrown away).
            if os.path.isfile(pic_path):
                print("图片存在 [{}] - 第 [{}] 张图片".format(self.girl_name, img_name))
                continue
            img_data = requests.get(pic_url, headers=this_headers)
            print("正在保存 [{}] - 第 [{}] 张图片".format(self.girl_name, img_name))
            with open(pic_path, "wb") as f:
                f.write(img_data.content)
            # Throttle requests to be polite to the server.
            time.sleep(0.3)

    def start(self):
        """Run the full pipeline: listing pages -> galleries -> images."""
        self.get_page_urls()
        self.get_girl_urls()
        self.get_pic_urls()
if __name__ == '__main__':
    # Pass the total listing-page count to Spider(); as of 2019-12-25 the
    # site listed 238 pages.
    Spider(238).start()
    # Alternatively, prompt the user for the page count:
    #Spider(input("请输入总页数:")).start()