# 一个简单的Python脚本:获取云展网的图片数据,并转换、合并成PDF。
# 它并不批量抓取网站的书籍数据,而是通过输入单本书籍的URL来获取该书;初衷是帮朋友获取一本书的图片数据,
# 考虑到其他人可能也有同样的需求,所以贴出代码,仅供学习参考。
# 云展网做了反爬机制,使用前需先登录云展网,并将headers里的Cookie替换为自己的。
import json
import os
import re
import shutil
from urllib.parse import quote, urlparse

import img2pdf
import requests
from PIL import Image
from tqdm import tqdm
# Headers sent with every request to yunzhan365. The Cookie value below is a
# placeholder and must be replaced before the script will do anything.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76',
    'Cookie': 'MUID=xxxxxxxxx'  # log in to yunzhan365 and paste the cookie from the request headers here
}
# Directory that receives the downloaded pages, the converted JPGs and the PDF.
path = "files"

BOOK_HOST = 'https://book.yunzhan365.com'
PLACEHOLDER_COOKIE = 'MUID=xxxxxxxxx'
# Seconds to wait for the server; without a timeout a stalled connection
# would hang the script forever.
REQUEST_TIMEOUT = 30


def extract_book_path(page_url):
    """Return the book's path component (e.g. ``/abcd/efgh/``) from a reader URL.

    The expected input looks like
    ``https://book.yunzhan365.com/abcd/efgh/mobile/index.html``. Unlike a
    fixed-offset slice, this also accepts ``http://``, a missing scheme,
    surrounding whitespace, a query string, or a URL that stops at the book
    directory.

    Args:
        page_url: URL of the book as typed by the user.

    Returns:
        The path with a leading and trailing ``/``, ready to be placed
        between the host and a relative resource name.
    """
    page_url = page_url.strip()
    if '://' not in page_url:
        # urlparse would treat a scheme-less URL as a bare path.
        page_url = 'https://' + page_url
    book_path = urlparse(page_url).path
    if book_path.endswith('index.html'):
        book_path = book_path[:-len('index.html')]
    book_path = book_path.rstrip('/')
    if book_path.endswith('/mobile'):
        book_path = book_path[:-len('/mobile')]
    return book_path + '/'


def parse_config(config_js):
    """Parse the object literal assigned in ``config.js`` into a dict.

    The file has the form ``var htmlConfig = {...};``. Only the text between
    the first ``{`` and the last ``}`` is decoded, so separators that occur
    inside string values and trailing whitespace after the semicolon cannot
    corrupt the payload.

    Args:
        config_js: Full text of the downloaded ``config.js``.

    Returns:
        The decoded configuration dictionary.

    Raises:
        ValueError: If no object literal is present or it is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = config_js.find('{')
    end = config_js.rfind('}')
    if start == -1 or end < start:
        raise ValueError('no JSON object found in config.js')
    return json.loads(config_js[start:end + 1])


def collect_image_names(config):
    """Return the first image name listed under "n" for every page entry.

    Pages without an "n" entry (or with an empty one) are skipped instead of
    raising, and a config without "fliphtml5_pages" yields an empty list.

    Args:
        config: Dictionary produced by ``parse_config``.

    Returns:
        List of relative image names in page order.
    """
    names = []
    for page in config.get("fliphtml5_pages", []):
        candidates = page.get("n", [])
        if candidates:
            names.append(candidates[0])
    return names


def image_url(book_path, image_name):
    """Build the absolute URL of one page image.

    Args:
        book_path: Value returned by ``extract_book_path``.
        image_name: Relative name from the config; any ``../`` segments are
            removed before it is appended to the book path.
    """
    return f"{BOOK_HOST}{book_path}{image_name.replace('../', '')}"


def fetch_config(book_path):
    """Download and decode the book's ``config.js``.

    Args:
        book_path: Value returned by ``extract_book_path``.

    Returns:
        The configuration dict, or ``None`` after printing the reason when
        the request fails or the payload cannot be parsed.
    """
    data_url = f'{BOOK_HOST}{book_path}mobile/javascript/config.js'
    print(data_url)
    try:
        response = requests.get(data_url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print('获取config文件失败:', e)
        return None
    if response.status_code != 200:
        print('获取config文件失败, status code:', response.status_code)
        return None
    try:
        return parse_config(response.text)
    except ValueError as e:
        print('JSON解析失败,可能需要检查字符串格式。', e)
        return None


def download_images(book_path, image_names, out_dir):
    """Download every page image into ``out_dir`` as ``<page number>.webp``.

    Failed pages are reported and skipped so one bad page does not abort the
    whole book.

    Args:
        book_path: Value returned by ``extract_book_path``.
        image_names: Relative image names in page order.
        out_dir: Existing directory to write into.

    Returns:
        Paths of the files that were written, in page order.
    """
    saved = []
    for num, name in enumerate(image_names, start=1):
        url_jpg = image_url(book_path, name)
        print(url_jpg)
        try:
            # Same headers as the config request, so both requests to the
            # host are sent consistently.
            response = requests.get(url_jpg, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print("无法下载图片,请求失败。")
            continue
        if response.status_code != 200:
            print("无法下载图片,请求失败。")
            continue
        target = os.path.join(out_dir, f"{num}.webp")
        with open(target, 'wb') as f:
            f.write(response.content)
        saved.append(target)
    return saved


def convert_to_jpg(webp_files):
    """Convert each downloaded image to an RGB JPEG stored next to it.

    Args:
        webp_files: Paths returned by ``download_images``.

    Returns:
        Paths of the written ``.jpg`` files, in the same order.
    """
    jpg_files = []
    for webp_path in webp_files:
        jpg_path = os.path.splitext(webp_path)[0] + '.jpg'
        with Image.open(webp_path) as img:
            img.convert('RGB').save(jpg_path, 'JPEG')
        jpg_files.append(jpg_path)
    return jpg_files


def merge_to_pdf(jpg_files, pdf_path):
    """Write all JPEGs into a single PDF using an A4 (210 x 297 mm) page size.

    Args:
        jpg_files: Image paths in page order.
        pdf_path: Destination file.
    """
    a4inpt = (img2pdf.mm_to_pt(210), img2pdf.mm_to_pt(297))
    layout_fun = img2pdf.get_layout_fun(a4inpt)
    with open(pdf_path, 'wb') as f:
        f.write(img2pdf.convert(jpg_files, layout_fun=layout_fun))


def main():
    """Prompt for a book URL, download its pages and optionally build a PDF."""
    if headers['Cookie'] == PLACEHOLDER_COOKIE:
        print('获取cookie并在headers里替换后可使用,云展网:https://www.yunzhan365.com/')
        return

    url = extract_book_path(input("请输入url:"))
    os.makedirs(path, exist_ok=True)
    print(url)

    data = fetch_config(url)
    image_list = collect_image_names(data) if data is not None else []
    if not image_list:
        print("获取图片地址失败")
        return

    # Only files written during this run are converted and merged, so pages
    # left in the folder by an earlier book cannot leak into the new PDF.
    webp_files = download_images(url, image_list, path)
    if not webp_files:
        return

    pdf_choose = input("爬取成功,是否合并为pdf文件?(是Y/否N):")
    if pdf_choose in ("Y", "y", "是"):
        jpg_files = convert_to_jpg(webp_files)
        print("图片转换完成!")
        pdf_path = os.path.join(path, "123.pdf")
        merge_to_pdf(jpg_files, pdf_path)
        print("合并成功,请前往程序所在路径查看")
        print(pdf_path)


if __name__ == "__main__":
    main()
# 评论列表,共 0 条评论
# 暂无评论