import json
import os
import random
import time

from bs4 import BeautifulSoup

import requests

from cartoon import cartoon_data_import
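

# Scraper for m.qiremanhua.com: fetches book lists, chapter lists, comments,
# book detail pages and chapter images, and caches them on disk as JSON / JPG
# files under E:\动漫\文网文素材\. The list-style AJAX endpoints appear to
# return {"status": 1, "state": 1, "data": [...]} on success (see parse_list).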
class QireManHuaSpider:
    # Parse an HTML document with BeautifulSoup
    def __parse_html(self, data):
        soup = BeautifulSoup(data, "lxml")
        return soup

    # Book list (paginated)
    def list_books(self, page):
        result = requests.post("https://m.qiremanhua.com/book/book_cate_ajax",
                               {"tid": "all", "vip": "all", "end": "all", "sort": 1, "page": page,
                                "random": random.randint(0, 10000)})
        return result.text

    # Chapter list of a book
    def chaplist(self, book_id):
        result = requests.post("https://m.qiremanhua.com/book/chaplist_ajax",
                               {"book_id": book_id, "sort": 1})
        return result.text

    # Comment list of a book (paginated)
    def commentlist(self, book_id, page):
        result = requests.post("https://m.qiremanhua.com/comment/list_comment_ajax",
                               {"bookId": book_id, "page": page, "random": random.randint(0, 10000)})
        return result.text

    # Book detail page (HTML)
    def book_detail(self, book_id):
        result = requests.get(f"https://m.qiremanhua.com/book/{book_id}/")
        return result.text

    # Chapter content page (HTML)
    def chap_content(self, book_id, cid):
        result = requests.get(f"https://m.qiremanhua.com/book/{book_id}/{cid}/")
        return result.text

    # Search by title and return the vertical cover URL of the exact match
    def vertical_cover(self, key):
        result = requests.get(f"https://m.qiremanhua.com/book/search?key={key}")
        soup = self.__parse_html(result.text)
        items = soup.find("div", class_="books-rows").find_all("div", class_="item")
        for item in items:
            title = item.find("div", class_="title").text.strip()
            if title == key.strip():
                return item.find("img")["src"]
        return None

    # "More" items of a home-page recommendation section (paginated)
    def home_recommend_more(self, id_, page):
        result = requests.post("https://m.qiremanhua.com/book/more_ajax",
                               {"id": id_, "page": page, "random": random.randint(0, 10000)})
        return result.text

    # Parse a JSON list response; return the data payload, or [] so callers
    # that concatenate pages never hit a None
    def parse_list(self, data):
        try:
            data = json.loads(data)
            if data["status"] == 1 and data["state"] == 1:
                return data["data"]
            return []
        except Exception:
            print(data)
            return []

    # Extract the chapter's image URLs (lazy-loaded via data-original)
    def parse_content(self, data):
        soup = self.__parse_html(data)
        sp = soup.find("div", class_="episode-detail").find_all("img")
        imgs = []
        for p in sp:
            imgs.append(p["data-original"])
        return imgs

    # Detail: (cover, title, (category 1, category 2), update info, description,
    # author, popularity, likes, favorites)
    def parse_detail(self, data):
        soup = self.__parse_html(data)
        sp = soup.find("div", class_="book-hero").find("img")
        cover = sp["src"].strip()
        title = soup.find("div", class_="book-hero__detail").find("h1").text.strip()
        tags = []
        for t in soup.find("div", class_="book-hero__detail").find("div", class_="tags").find_all("div"):
            tags.append(t.text.strip())
        update_info = ""
        if soup.find("div", class_="book-container__head").find("div", class_="update"):
            update_info = soup.find("div", class_="book-container__head").find(
                "div", class_="update").text.strip().replace("更新:", "")

        desc = soup.find("div", class_="book-container__detail").text.strip()
        author = soup.find("div", class_="book-container__author").text.strip().replace("作者:", "")
        vals = soup.find("div", class_="book-container__row").find_all("div", class_="number")
        return cover, title, tags, update_info, desc, author, vals[0].text, vals[1].text, vals[2].text


def download_file(path, img):
    # Download an image, sending a site Referer and a mobile User-Agent
    r = requests.get(img, headers={"Referer": "https://m.qiremanhua.com/",
                                   "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/603.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/603.1"})
    with open(path, 'wb') as f:
        f.write(r.content)


def save_file(path, text):
    with open(path, mode='w', encoding="utf-8") as f:
        f.write(text)


def read_file(path):
    # The cached JSON files are single-line dumps; return that line
    with open(path, mode='r', encoding="utf-8") as f:
        lines = f.readlines()
        return lines[0]


spider = QireManHuaSpider()
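

# On-disk layout used by the crawl helpers below (as built in their paths):
# E:\动漫\文网文素材\ holds book_detail\, chap_list\, comment_list\,
# content_list\<book_id>\<chapter_id>\<n>.jpg, cover\, recommend\, and the
# comment avatars mirrored by the URL path that follows /uploads/.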
def spide_detail(book_id):
    # Fetch and cache the book detail
    path = f"E:\\动漫\\文网文素材\\book_detail\\{book_id}.json"
    if not os.path.exists(path):
        detail = spider.parse_detail(spider.book_detail(book_id))
        save_file(path, json.dumps(detail))

    path = f"E:\\动漫\\文网文素材\\chap_list\\{book_id}.json"
    if not os.path.exists(path):
        # Fetch the chapter list
        chaplist = spider.parse_list(spider.chaplist(book_id))
        save_file(path, json.dumps(chaplist))
    # Fetch the comments page by page; stop when a page returns fewer than 10
    path = f"E:\\动漫\\文网文素材\\comment_list\\{book_id}.json"
    if not os.path.exists(path):
        page = 0
        comment_list = []
        while True:
            page += 1
            temp_list = spider.parse_list(spider.commentlist(book_id, page))
            comment_list += temp_list
            if len(temp_list) < 10:
                break
            else:
                time.sleep(0.3)
        save_file(path, json.dumps(comment_list))
    print("Finished crawling", book_id)


# Crawl chapter contents (images) for a book
def spide_content(book_id):
    path = f"E:\\动漫\\文网文素材\\chap_list\\{book_id}.json"
    if not os.path.exists(path):
        return
    with open(path, mode='r', encoding="utf-8") as f:
        data = f.readline()
        data = json.loads(data)
        for d in data:
            if d['cBS'] == 0:
                cid = d['cId']
                content = spider.chap_content(book_id, cid)
                content = spider.parse_content(content)
                has_download = False
                for i in range(len(content)):
                    path_ = f"E:\\动漫\\文网文素材\\content_list\\{book_id}\\{cid}\\{i}.jpg"
                    dir_ = os.path.abspath(os.path.join(path_, os.pardir))
                    if not os.path.exists(dir_):
                        os.makedirs(dir_)
                    if not os.path.exists(path_):
                        # Download the image
                        download_file(path_, content[i])
                        has_download = True
                if has_download:
                    time.sleep(1)


def __get_cover():
    # Download the landscape covers referenced by the cached book_list pages
    for p in range(1, 100):
        path = f"E:\\动漫\\文网文素材\\book_list\\{p}.json"
        if not os.path.exists(path):
            continue
        data = read_file(path)
        data = json.loads(data)
        for d in data:
            cover = d["book_cover"]
            ps = cover.split("/")
            dir_ = f"E:\\动漫\\文网文素材\\cover\\{ps[-2]}"
            if not os.path.exists(dir_):
                os.mkdir(dir_)

            if os.path.exists(f"{dir_}\\{ps[-1]}"):
                continue
            download_file(f"{dir_}\\{ps[-1]}", d["book_cover"])
            print("Landscape cover", d["id"], d["book_name"], d["book_cover"])


def __get_vertical_cover():
    # Fill in vertical covers: use the one recorded in cartoon_data_import if
    # present, otherwise look it up via the site search and record it, then
    # download the image
    for p in range(1, 100):
        path = f"E:\\动漫\\文网文素材\\book_list\\{p}.json"
        if not os.path.exists(path):
            continue
        data = read_file(path)
        data = json.loads(data)
        for d in data:
            cover = cartoon_data_import.get_vertical_cover(d["id"])
            if not cover or len(cover) < 20:
                cover = spider.vertical_cover(d['book_name'])
                if cover:
                    cartoon_data_import.update_vertical_cover(d["id"], cover)
                else:
                    print("No search result for:", d["id"], d['book_name'])
            if not cover:
                continue
            ps = cover.split("/")
            dir_ = f"E:\\动漫\\文网文素材\\cover\\{ps[-2]}"
            if not os.path.exists(dir_):
                os.mkdir(dir_)

            if os.path.exists(f"{dir_}\\{ps[-1]}"):
                continue
            download_file(f"{dir_}\\{ps[-1]}", cover)
            print("Vertical cover", d["id"], d["book_name"], cover)


def __get_comment_user_avtor():
    # Download comment users' avatars, mirroring the URL path after /uploads/
    avtors = cartoon_data_import.list_comment_user_avtor()
    index = 0
    for cover in avtors:
        index += 1
        if not cover:
            continue
        if cover.find("/uploads/") < 0:
            continue
        ps = cover.split("/uploads/")[1].split("/")
        dir_ = "E:\\动漫\\文网文素材\\" + "\\".join(ps[:-1])
        if not os.path.exists(dir_):
            os.makedirs(dir_)

        if os.path.exists(f"{dir_}\\{ps[-1]}"):
            continue
        download_file(f"{dir_}\\{ps[-1]}", cover)
        print("Avatar downloaded", index, cover)


def __get_home_recommend():
    # Crawl each home-page recommendation section until a short page is returned
    ids = [59, 60, 61, 62, 63, 64, 65, 66]
    for id_ in ids:
        page = 1
        fdata = []
        while True:
            data = spider.home_recommend_more(id_, page)
            page += 1
            data = spider.parse_list(data)
            fdata += data
            if len(data) < 10:
                break
        # Save the collected entries
        path = f"E:\\动漫\\文网文素材\\recommend\\{id_}.json"
        if not os.path.exists(path):
            save_file(path, json.dumps(fdata))
if __name__ == "__main__0":
|
__get_home_recommend()
|
|
if __name__ == "__main__":
|
__get_comment_user_avtor()
|
    # count = 0
    # for p in range(1, 100):
    #     path = f"E:\\动漫\\文网文素材\\book_list\\{p}.json"
    #     if not os.path.exists(path):
    #         continue
    #     data = read_file(path)
    #     data = json.loads(data)
    #     for d in data:
    #         count += 1
    #         print("Start crawling", d["id"])
    #         # spide_detail(d["id"])
    #         spide_content(d["id"])
    # print("Total", count)

    # data = spider.chap_content("10762", "57721")
    # print(spider.parse_detail(spider.book_detail("10762")))
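

# A minimal sketch (an assumption, not part of the original entry points) of
# how the book_list\{page}.json files consumed by __get_cover and
# __get_vertical_cover could be produced with list_books + parse_list:
# for p in range(1, 100):
#     path = f"E:\\动漫\\文网文素材\\book_list\\{p}.json"
#     if os.path.exists(path):
#         continue
#     books = spider.parse_list(spider.list_books(p))
#     if not books:
#         break
#     save_file(path, json.dumps(books))
#     time.sleep(0.3)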