# -*- encoding: utf-8 -*-
"""
@File : 80S.py
@Time : 2019/12/26 11:40
@Author : YiFang
@Email : [email protected]
@Software: PyCharm
"""
import os
import time
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
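# Third-party dependencies: requests, lxml and selenium (plus a Chrome/chromedriver
# pair for the headless-browser branch); everything else is from the standard library.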
# Base URL of the movie list page (the variable name keeps the original spelling)
BASE_DOMIN = "http://www.8080s.net/movie/list/"
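# The interactive menu at the bottom of the script extends this base URL with four
# filter segments (category-year-region-language, each left blank for "全部"/all),
# and Spider() then appends "-p{page}" for pagination.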
# Output file path
PATH = os.path.join(os.getcwd(), '80S_Movies.txt')
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'http://www.8080s.net/'
}
chrome_options = Options()
# Run Chrome in headless mode (no visible window)
chrome_options.add_argument('--headless')
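# Note: the browser branch assumes a chromedriver matching the local Chrome is
# available on PATH. The calls used below (webdriver.Chrome(chrome_options=...),
# find_element_by_xpath) follow the Selenium 3-era API that was current when the
# script was written; later Selenium 4 releases drop them, so pin a 3.x selenium
# (or port those calls) to run the script unchanged.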
# Scrape one movie detail page and append its download links to the output file
def get_movie_info(u):
    html = requests.get(u, headers=headers).content
    selector = etree.HTML(html)
    # Title
    try:
        title = str(selector.xpath('//div[@class="info"]/h1/text()')[0]).replace("\n \t ", "").replace(" (", "[").replace(")\n ", "]")
    except IndexError:
        title = u
    print("正在爬取:[{}]".format(title))
    # Check whether the extra download resources sit behind a "load all" button
    no_button = False
    try:
        selector.xpath("//span[@class='dlb_link_link']")[0]
    except IndexError:
        no_button = True
    if no_button:
        # Everything is already in the static HTML
        # Download list (resource names)
        download_list = selector.xpath('//span[@class="dlname nm"]/span/a/text()')
        # Local download links
        download_url1 = selector.xpath('//span[@class="xunlei dlbutton3"]/a/@href')
        # Thunder (Xunlei) download links
        download_url2 = selector.xpath('//span[@class="xunlei dlbutton1"]/a/@href')
    else:
        # The button exists, so drive a headless browser and click it first
        driver = webdriver.Chrome(chrome_options=chrome_options)  # create the browser
        # driver.maximize_window()  # maximize the window
        driver.get(u)  # open the page
        time.sleep(3)  # wait for it to load
        button = driver.find_element_by_xpath("./*//span[@class='dlb_link_link']")
        driver.execute_script("arguments[0].scrollIntoView();", button)  # scroll the button into view
        time.sleep(1)
        driver.find_element_by_xpath("./*//span[@class='dlb_link_link']").click()  # click it
        time.sleep(1)
        html = driver.page_source
        driver.quit()  # close the browser and end the driver process
        selector = etree.HTML(html)
        # Download list (resource names)
        download_list = selector.xpath('//span[@class="dlname nm"]/span/a/text()')
        # Local download links
        download_url1 = selector.xpath('//span[@class="xunlei dlbutton3"]/a/@href')
        # Thunder (Xunlei) download links
        download_url2 = selector.xpath('//span[@class="xunlei dlbutton1"]/a/@href')
    # Append the result; opening the file as UTF-8 avoids the write errors the
    # original GB18030 fallback had to work around
    with open(PATH, "a", encoding="utf-8") as f:
        f.write("标题:{}\n下载链接:\n".format(title))
        for download_url in download_url1:
            f.write("{}\n".format(download_url))
        f.write("-" * 50)
        f.write("\n")
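# Each record appended to 80S_Movies.txt therefore looks roughly like this
# (illustrative only; the exact text depends on what the page returns):
#   标题:<movie title>
#   下载链接:
#   <one download URL per line>
#   --------------------------------------------------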
# Collect every movie detail link on one list page and scrape each of them
def get_page_urls(u):
    html = requests.get(u, headers=headers).content
    selector = etree.HTML(html)
    urls = selector.xpath('//ul[@class="me1 clearfix"]/li/a/@href')
    for url in urls:
        url = "http://www.8080s.net" + url
        get_movie_info(url)

# Walk the list pages 0 .. page_num-1
def Spider(page_num):
    for num in range(0, page_num):
        get_page_urls(BASE_DOMIN + "-p" + str(num))
    print("恭喜,[{}] 页影片爬取完毕".format(page_num))
if __name__ == '__main__':
    # Pull the filter menus (language / category / year / region) from the list page
    html = requests.get(BASE_DOMIN, headers=headers).content
    selector = etree.HTML(html)
    infos = selector.xpath('//div[@class="f_block2"]/dl/ul/li/a/text()')
    # Number the options; every filter group starts with "全部" (all), which resets the counter
    string = ""
    num = 0
    for info in infos:
        if info == "全部":
            num = 1
            string += "\n" + str(num) + "." + info
        else:
            num += 1
            string += " | " + str(num) + "." + info
    # Split the numbered text back into one section per filter group
    infos = string.split("\n1.全部")
    # Choose the language
    languages = infos[1].split(" | ")
    languages[0] = "1.全部"
    print("选择语言:")
    print("语言: 1.全部" + infos[1])
    language = input("请选择 [语言] (默认全部):")
    if language == "":
        language = 0
    else:
        language = int(language) - 1
    print("设置语言: [{}]".format(languages[language]))
    print("-" * 50)
    # Choose the category
    sorts = infos[2].split(" | ")
    sorts[0] = "1.全部"
    print("选择分类:")
    print("分类: 1.全部" + infos[2])
    sort = input("请选择 [分类] (默认全部):")
    if sort == "":
        sort = 0
    else:
        sort = int(sort) - 1
    print("设置分类: [{}]".format(sorts[sort]))
    print("-" * 50)
    # Choose the year
    years = infos[3].split(" | ")
    years[0] = "1.全部"
    print("选择年代:")
    print("年代: 1.全部" + infos[3])
    year = input("请选择 [年代] (默认全部):")
    if year == "":
        year = 0
    else:
        year = int(year) - 1
    print("设置年代: [{}]".format(years[year]))
    print("-" * 50)
    # Choose the region
    addrs = infos[4].split(" | ")
    addrs[0] = "1.全部"
    print("选择地区:")
    print("地区: 1.全部" + infos[4].split("其他")[0] + "其他")
    addr = input("请选择 [地区] (默认全部):")
    if addr == "":
        addr = 0
    else:
        addr = int(addr) - 1
    print("设置地区: [{}]".format(addrs[addr]))
    print("-" * 50)
    print("请确认配置:")
    print("语言: {}".format(str(languages[language])))
    print("分类: {}".format(str(sorts[sort])))
    print("年代: {}".format(str(years[year])))
    print("地区: {}".format(str(addrs[addr])))
    print("-" * 50)
    if input("确认无误请按下 [回车] :") == "":
        # Build the URL in order: [category]
        if sort == 0:
            BASE_DOMIN = BASE_DOMIN + "-"
        else:
            BASE_DOMIN = BASE_DOMIN + "{}-".format(sort)
        # Build the URL in order: [year]
        if year == 0:
            BASE_DOMIN = BASE_DOMIN + "-"
        else:
            try:
                y = str(years[year].split(".")[1]).replace("-", "_")
                BASE_DOMIN = BASE_DOMIN + "{}-".format(y)
            except Exception:
                BASE_DOMIN = BASE_DOMIN + "{}-".format(years[year].split(".")[1])
        # Build the URL in order: [region]
        if addr == 0:
            BASE_DOMIN = BASE_DOMIN + "-"
        else:
            BASE_DOMIN = BASE_DOMIN + "{}-".format(addr)
        # Build the URL in order: [language] (menu index remapped to the site's language code)
        if language == 0:
            BASE_DOMIN = BASE_DOMIN + "-"
        elif language == 1:
            BASE_DOMIN = BASE_DOMIN + "1-"
        elif language == 2:
            BASE_DOMIN = BASE_DOMIN + "7-"
        elif language == 3:
            BASE_DOMIN = BASE_DOMIN + "2-"
        elif language == 4:
            BASE_DOMIN = BASE_DOMIN + "3-"
        elif language == 5:
            BASE_DOMIN = BASE_DOMIN + "4-"
        elif language == 6:
            BASE_DOMIN = BASE_DOMIN + "5-"
        elif language == 7:
            BASE_DOMIN = BASE_DOMIN + "6-"
        print("URL拼接完成:{}".format(BASE_DOMIN))
        print("-" * 50)
        # Choose how many pages to crawl
        print("开始获取总页数")
        html = requests.get(BASE_DOMIN, headers=headers).content
        selector = etree.HTML(html)
        page_num = str(selector.xpath('//div[@class="pager"]/a/@href')[5]).split("p")[1]
        print("当前配置总页数: [{}] 页".format(page_num))
        page_num = input("请输入 [页数] (默认全部):")
        if page_num == "":
            page_num = str(selector.xpath('//div[@class="pager"]/a/@href')[5]).split("p")[1]
        print("设置爬取页数: [{}] 页".format(str(page_num)))
        print("-" * 50)
        if input("按下 [回车] 开始爬取:") == "":
            Spider(int(page_num))
        else:
            print("脚本已结束")
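# Usage: running this file walks through four filter menus (language, category,
# year, region), confirms the assembled list URL, asks how many pages to crawl,
# and appends each film's title and download links to 80S_Movies.txt in the
# current working directory. For a one-off grab you can also call the helpers
# directly, e.g. get_movie_info("http://www.8080s.net/movie/12345") with a real
# detail-page link in place of the made-up id.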