# -*- coding: utf-8 -*-
"""
@File    : MeiZiTu.py
@Time    : 2019/12/25 22:12
@Author  : YiFang
@Email   : [email protected]
@Software: PyCharm
"""
import os
import time
import requests
from lxml import etree

# Root directory for downloaded galleries (created on demand in Spider.download_pic).
PICTURS_PATH = os.path.join(os.getcwd(), 'Picturs/')

# Default request headers for list/gallery pages.
# NOTE(review): Referer points at the site itself — presumably required by its
# hotlink protection; confirm against the site's behavior.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer':'http://www.mzitu.com'
}

class Spider(object):
    """Scraper for mzitu.com.

    Workflow (see start()): collect list-page URLs, extract every gallery URL
    from them, then for each gallery derive its image URLs and download the
    images into a per-gallery folder under PICTURS_PATH.
    """

    def __init__(self, page_num):
        # page_num may arrive as str (from input()) or int; it is converted
        # with int() at the point of use.
        self.page_num = page_num
        # Page 1 of the listing is the bare domain, not /page/1.
        self.page_urls = ['https://www.mzitu.com']
        self.girl_urls = []   # gallery URLs collected from the list pages
        self.girl_name = ''   # title of the gallery currently being processed
        self.pic_urls = []    # image URLs of the gallery currently being processed

    # Build the URL of every list page (pages 2..page_num; page 1 is preset).
    def get_page_urls(self):
        for n in range(2, int(self.page_num) + 1):
            self.page_urls.append('https://www.mzitu.com/page/' + str(n))

    # Extract every gallery URL from each list page.
    def get_girl_urls(self):
        for page_url in self.page_urls:
            html = requests.get(page_url, headers=headers).content
            selector = etree.HTML(html)
            self.girl_urls += selector.xpath('//ul[@id="pins"]/li/span/a/@href')

    # For each gallery: read title and page count, derive all image URLs, download.
    def get_pic_urls(self):
        for girl_url in self.girl_urls:
            html = requests.get(girl_url, headers=headers).content
            selector = etree.HTML(html)

            # Gallery title — also used as the download folder name.
            self.girl_name = selector.xpath('//h2[@class="main-title"]/text()')[0]
            # Highest image number, taken from the pagination bar.
            max_num = int(selector.xpath('//div[@class="pagenavi"]/a/span/text()')[4])
            # Image URLs follow <prefix>NN.jpg; split the first image's URL on
            # "01." to recover the prefix.
            # NOTE(review): assumes two-digit numbering and a .jpg extension
            # for every gallery — confirm against the site.
            prefix = str(selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]).split("01.")[0]
            self.pic_urls = [prefix + str(num).zfill(2) + ".jpg"
                             for num in range(1, max_num + 1)]

            try:
                self.download_pic(girl_url)
            except Exception:
                # Best effort: one broken gallery must not abort the whole run.
                print("保存失败:" + self.girl_name)

    # Download every image of the current gallery into its own folder.
    def download_pic(self, this_referer):
        # makedirs(exist_ok=True) replaces the former bare `except: pass`,
        # which silently hid every OSError, not just "already exists".
        os.makedirs(PICTURS_PATH, exist_ok=True)
        girl_path = os.path.join(PICTURS_PATH, self.girl_name)
        if os.path.isdir(girl_path):
            print("[{}] - 文件夹已存在".format(self.girl_name))
        os.makedirs(girl_path, exist_ok=True)

        # Per-image headers: the Referer must point at the gallery page,
        # otherwise the image server rejects the request (hotlink protection).
        this_headers = {
            'User-Agent': headers['User-Agent'],
            'Referer': this_referer,
        }

        for img_name, pic_url in enumerate(self.pic_urls, start=1):
            pic_path = os.path.join(girl_path, str(img_name) + ".jpg")
            # Check for an existing file BEFORE fetching — the original issued
            # the network request first and then threw the data away.
            if os.path.isfile(pic_path):
                print("图片存在 [{}] - 第 [{}] 张图片".format(self.girl_name, img_name))
                continue
            img_data = requests.get(pic_url, headers=this_headers)
            with open(pic_path, "wb") as f:
                print("正在保存 [{}] - 第 [{}] 张图片".format(self.girl_name, img_name))
                f.write(img_data.content)
            time.sleep(0.3)  # throttle requests to stay polite

    # Run the full pipeline.
    def start(self):
        self.get_page_urls()
        self.get_girl_urls()
        self.get_pic_urls()


if __name__ == '__main__':
    # Pass the total number of list pages to Spider().
    # As of 2019-12-25 the site listed 238 pages.
    Spider(238).start()
    # Alternative: prompt for the page count interactively.
    #Spider(input("请输入总页数:")).start()
# Last modified: 2019-12-25 11:44 PM