# -*- encoding: utf-8 -*-
"""
@File    : 80S.py
@Time    : 2019/12/26 11:40
@Author  : YiFang
@Email   : [email protected]
@Software: PyCharm
"""
import os
import time
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Base URL of the movie list pages; the filter and page segments are appended to it
BASE_DOMIN = "http://www.8080s.net/movie/list/"

# Output file: 80S_Movies.txt in the current working directory
PATH = os.path.join(os.getcwd(), '80S_Movies.txt')

# HTTP headers passed to every requests.get call in this script
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer':'http://www.8080s.net/'
}

# Options handed to webdriver.Chrome when a page has to be rendered in a browser
chrome_options=Options()
# Run Chrome in headless mode (no visible window)
chrome_options.add_argument('--headless')

# Scrape one movie page and record its download links
def _load_expanded_page(u):
    """Render ``u`` in headless Chrome, click the ``dlb_link_link`` control
    and return the resulting page source.

    The control is the one that loads the complete list of download
    resources. The browser is shut down in a ``finally`` clause so that no
    Chrome process is left behind when locating or clicking the control fails.
    """
    button_xpath = "./*//span[@class='dlb_link_link']"
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.get(u)
        time.sleep(3)  # wait for the page to load
        button = driver.find_element_by_xpath(button_xpath)
        # Scroll the control into view before clicking it
        driver.execute_script("arguments[0].scrollIntoView();", button)
        time.sleep(1)
        # Look the control up again right before the click, as the original did
        driver.find_element_by_xpath(button_xpath).click()
        time.sleep(1)  # wait for the extra entries to be inserted
        return driver.page_source
    finally:
        # quit() closes every window and ends the browser process
        driver.quit()


def get_movie_info(u):
    """Scrape the movie detail page ``u`` and append its links to ``PATH``.

    Args:
        u: URL of the movie detail page.

    The title is taken from the ``div.info > h1`` text; when that node is
    missing the URL itself is used as the title. If the static HTML contains
    a ``dlb_link_link`` span, the page is rendered in headless Chrome and the
    control is clicked first, so the full link list is present. Only the
    "local download" links (``xunlei dlbutton3``) are written.

    The output file is appended to as UTF-8, so every title and link can be
    written regardless of the platform's default encoding.
    """
    html = requests.get(u, headers=headers).content
    selector = etree.HTML(html)

    # Title, with the page's surrounding whitespace and parentheses normalised
    title_nodes = selector.xpath('//div[@class="info"]/h1/text()')
    if title_nodes:
        title = str(title_nodes[0]).replace("\n    \t                ", "").replace("                (","[").replace(")\n            ", "]")
    else:
        title = u

    print("正在爬取:[{}]".format(title))

    # A "load all" control in the static HTML means part of the list is hidden;
    # re-parse the browser-rendered page in that case.
    if selector.xpath("//span[@class='dlb_link_link']"):
        selector = etree.HTML(_load_expanded_page(u))

    # Local-download links
    download_urls = selector.xpath('//span[@class="xunlei dlbutton3"]/a/@href')

    with open(PATH, "a", encoding="utf-8") as f:
        f.write("标题:{}\n下载链接:\n".format(title))
        f.writelines("{}\n".format(download_url) for download_url in download_urls)
        f.write("-" * 50)
        f.write("\n")

# Collect every movie link on one list page and scrape each of them
def get_page_urls(u):
    """Fetch the list page ``u`` and call ``get_movie_info`` for each movie.

    Args:
        u: URL of a movie list page.

    The hrefs found under ``ul.me1.clearfix > li > a`` are prefixed with the
    site origin before being passed on, in document order.
    """
    response = requests.get(u, headers=headers)
    tree = etree.HTML(response.content)
    for href in tree.xpath('//ul[@class="me1 clearfix"]/li/a/@href'):
        get_movie_info("".join(("http://www.8080s.net", href)))

def Spider(page_num):
    """Crawl ``page_num`` list pages, then print a completion message.

    Args:
        page_num: Number of list pages to crawl.

    Each page URL is the module-level ``BASE_DOMIN`` followed by
    ``-p<index>`` for index 0 .. page_num - 1.

    NOTE(review): the index starts at 0 -- confirm that the site serves
    "-p0"; if its pages are numbered from 1 the last page is never visited.
    """
    for index in range(page_num):
        page_url = "".join((BASE_DOMIN, "-p", str(index)))
        get_page_urls(page_url)
    print("恭喜,[{}] 页影片爬取完毕".format(page_num))


# Language code used in the list URL, keyed by the zero-based position in the
# language menu. The mapping is not sequential (position 2 maps to code 7),
# so it is kept as an explicit table; position 0 ("all") is an empty segment.
_LANGUAGE_CODES = {0: "", 1: "1", 2: "7", 3: "2", 4: "3", 5: "4", 6: "5", 7: "6"}


def _group_filter_labels(labels):
    """Split the flat list of filter link texts into numbered menus.

    Every "全部" ("all") entry starts a new menu and becomes "1.全部"; the
    entries that follow are numbered 2, 3, ... within that menu. Entries that
    appear before the first "全部" are ignored.

    Returns:
        A list of menus, each a list of strings of the form "<number>.<text>".
    """
    menus = []
    for label in labels:
        if label == "全部":
            menus.append(["1.全部"])
        elif menus:
            menus[-1].append("{}.{}".format(len(menus[-1]) + 1, label))
    return menus


def _prompt_choice(label, options, menu_text=None):
    """Show one filter menu and return the zero-based index the user picks.

    Args:
        label: Menu name inserted into the prompts.
        options: Numbered option strings; index 0 is the "all" entry.
        menu_text: Text to display instead of all options joined by " | ".

    Empty input selects index 0. Anything that is not a number within the
    list is rejected and the prompt is repeated, so the returned index is
    always valid for ``options``.
    """
    if menu_text is None:
        menu_text = " | ".join(options)
    print("选择{}:".format(label))
    print("{}: {}".format(label, menu_text))
    prompt = "请选择 [{}] (默认全部):".format(label)
    while True:
        answer = input(prompt).strip()
        if not answer:
            choice = 0
            break
        try:
            choice = int(answer) - 1
        except ValueError:
            choice = -1
        if 0 <= choice < len(options):
            break
        # "Invalid input, please enter a number from the list"
        print("输入无效,请输入列表中的序号")
    print("设置{}: [{}]".format(label, options[choice]))
    print("-" * 50)
    return choice


def _prompt_page_count(total_pages):
    """Ask how many pages to crawl and return the answer as an int.

    Args:
        total_pages: Page count read from the pager, or None when it could
            not be determined.

    Empty input selects ``total_pages`` when it is known; otherwise the
    prompt repeats until a non-negative integer is entered.
    """
    if total_pages is None:
        # "Could not determine the total page count, please enter it manually"
        print("未能获取总页数,请手动输入页数")
    else:
        print("当前配置总页数: [{}] 页".format(total_pages))
    while True:
        answer = input("请输入 [页数] (默认全部):").strip()
        if not answer and total_pages is not None:
            return total_pages
        try:
            pages = int(answer)
        except ValueError:
            pages = -1
        if pages >= 0:
            return pages
        # "Invalid input, please enter a page count"
        print("输入无效,请输入页数")


if __name__ == '__main__':
    # Read the filter menus from the unfiltered list page
    html = requests.get(BASE_DOMIN, headers=headers).content
    selector = etree.HTML(html)
    menus = _group_filter_labels(
        selector.xpath('//div[@class="f_block2"]/dl/ul/li/a/text()'))
    if len(menus) < 4:
        # "Could not parse the filter options from the page, script ended"
        raise SystemExit("未能从页面解析出筛选条件,脚本已结束")

    # The first four menus are used as language, category, year and region
    languages, sorts, years, addrs = menus[:4]

    language = _prompt_choice("语言", languages)
    sort = _prompt_choice("分类", sorts)
    year = _prompt_choice("年代", years)
    # The region menu is displayed only up to its first "其他" entry
    addr_menu = " | ".join(addrs).split("其他")[0] + "其他"
    addr = _prompt_choice("地区", addrs, addr_menu)

    print("请确认配置:")
    print("语言: {}".format(str(languages[language])))
    print("分类: {}".format(str(sorts[sort])))
    print("年代: {}".format(str(years[year])))
    print("地区: {}".format(str(addrs[addr])))

    print("-" * 50)
    if input("确认无误请按下 [回车] :") == "":
        # URL segments in order: category, year, region, language.
        # An empty segment means "all".
        sort_code = "" if sort == 0 else str(sort)
        # The year segment is the menu text with "-" replaced by "_"
        year_code = "" if year == 0 else str(years[year].split(".")[1]).replace("-", "_")
        addr_code = "" if addr == 0 else str(addr)
        language_code = _LANGUAGE_CODES.get(language)
        if language_code is None:
            # No code is known for this menu position. Keep the URL
            # well-formed by falling back to "all" and tell the user.
            # "No code for this language, using all languages"
            print("该语言暂无对应编码,已改为全部语言")
            language_code = ""
        BASE_DOMIN = BASE_DOMIN + "{}-{}-{}-{}-".format(
            sort_code, year_code, addr_code, language_code)
        print("URL拼接完成:{}".format(BASE_DOMIN))
        print("-" * 50)

        # Read the total page count from the sixth pager link
        print("开始获取总页数")
        html = requests.get(BASE_DOMIN, headers=headers).content
        selector = etree.HTML(html)
        pager_links = selector.xpath('//div[@class="pager"]/a/@href')
        try:
            total_pages = int(str(pager_links[5]).split("p")[1])
        except (IndexError, ValueError):
            # The pager is missing, shorter than expected or not numeric
            total_pages = None
        page_num = _prompt_page_count(total_pages)
        print("设置爬取页数: [{}] 页".format(str(page_num)))
        print("-" * 50)
        if input("按下 [回车] 开始爬取:") == "":
            Spider(page_num)
    else:
        print("脚本已结束")
# Last modified: 2019-12-26 07:33 PM