diff --git a/.github/workflows/test-basic-funcs.yml b/.github/workflows/test-basic-funcs.yml
index 9d3d85c34..de3330205 100644
--- a/.github/workflows/test-basic-funcs.yml
+++ b/.github/workflows/test-basic-funcs.yml
@@ -40,12 +40,8 @@ jobs:
- name: Install dependencies
run: |
poetry install
- - name: Lint with flake8
- run: |
- # stop the build if there are Python syntax errors or undefined names
- poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
- # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Lint with ruff
+ run: poetry run ruff check .
- name: Test avid.py
run: |
poetry run pytest unittest/test_avid.py
diff --git a/javsp/__main__.py b/javsp/__main__.py
index 7771170e7..ff93f49b0 100644
--- a/javsp/__main__.py
+++ b/javsp/__main__.py
@@ -1,30 +1,29 @@
+import json
+import logging
import os
import re
import sys
-import json
+import threading
import time
-import logging
+from typing import Dict, List
+
+import requests
from PIL import Image
from pydantic import ValidationError
from pydantic_extra_types.pendulum_dt import Duration
-import requests
-import threading
-from typing import Dict, List
-sys.stdout.reconfigure(encoding='utf-8')
+sys.stdout.reconfigure(encoding="utf-8")
import colorama
import pretty_errors
from colorama import Fore, Style
from tqdm import tqdm
-
pretty_errors.configure(display_link=True)
+from javsp.cropper import get_cropper
from javsp.print import TqdmOut
-from javsp.cropper import Cropper, get_cropper
-
# 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作
root_logger = logging.getLogger()
@@ -32,23 +31,23 @@
if type(handler) == logging.StreamHandler:
handler.stream = TqdmOut
-logger = logging.getLogger('main')
+logger = logging.getLogger("main")
-from javsp.lib import resource_path
-from javsp.nfo import write_nfo
+from javsp.config import Cfg, CrawlerID
+from javsp.datatype import Movie, MovieInfo
from javsp.file import *
from javsp.func import *
from javsp.image import *
-from javsp.datatype import Movie, MovieInfo
+from javsp.lib import resource_path
+from javsp.nfo import write_nfo
from javsp.web.base import download
from javsp.web.exceptions import *
from javsp.web.translate import translate_movie_info
-from javsp.config import Cfg, CrawlerID
-
actressAliasMap = {}
+
def resolve_alias(name):
"""将别名解析为固定的名字"""
for fixedName, aliases in actressAliasMap.items():
@@ -68,30 +67,33 @@ def import_crawlers():
# if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)):
# logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器')
# continue
- import_name = 'javsp.web.' + name
+ import_name = "javsp.web." + name
__import__(import_name)
- valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用
+ valid_mods.append(
+ import_name
+ ) # 抓取器有效: 使用完整模块路径,便于程序实际使用
except ModuleNotFoundError:
- unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示
+ unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示
if unknown_mods:
- logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods))
+ logger.warning("配置的抓取器无效: " + ", ".join(unknown_mods))
# 爬虫是IO密集型任务,可以通过多线程提升效率
def parallel_crawler(movie: Movie, tqdm_bar=None):
"""使用多线程抓取不同网站的数据"""
+
def wrapper(parser, info: MovieInfo, retry):
"""对抓取器函数进行包装,便于更新提示信息和自动重试"""
crawler_name = threading.current_thread().name
- task_info = f'Crawler: {crawler_name}: {info.dvdid}'
+ task_info = f"Crawler: {crawler_name}: {info.dvdid}"
for cnt in range(retry):
try:
parser(info)
movie_id = info.dvdid or info.cid
logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'")
- setattr(info, 'success', True)
+ setattr(info, "success", True)
if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 抓取完成')
+ tqdm_bar.set_description(f"{crawler_name}: 抓取完成")
break
except MovieNotFoundError as e:
logger.debug(e)
@@ -103,9 +105,11 @@ def wrapper(parser, info: MovieInfo, retry):
logger.error(e)
break
except requests.exceptions.RequestException as e:
- logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}')
+ logger.debug(
+ f"{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}"
+ )
if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试')
+ tqdm_bar.set_description(f"{crawler_name}: 网络错误,正在重试")
except Exception as e:
logger.exception(e)
@@ -114,7 +118,7 @@ def wrapper(parser, info: MovieInfo, retry):
all_info = {i.value: MovieInfo(movie) for i in crawler_mods}
# 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取
- if movie.data_src == 'cid' and movie.dvdid:
+ if movie.data_src == "cid" and movie.dvdid:
crawler_mods = crawler_mods + Cfg().crawler.selection.normal
for i in all_info.values():
i.dvdid = None
@@ -123,13 +127,15 @@ def wrapper(parser, info: MovieInfo, retry):
thread_pool = []
for mod_partial, info in all_info.items():
mod = f"javsp.web.{mod_partial}"
- parser = getattr(sys.modules[mod], 'parse_data')
+ parser = getattr(sys.modules[mod], "parse_data")
# 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新
# TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1
- if hasattr(sys.modules[mod], 'parse_data_raw'):
+ if hasattr(sys.modules[mod], "parse_data_raw"):
th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1))
else:
- th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry))
+ th = threading.Thread(
+ target=wrapper, name=mod, args=(parser, info, Cfg().network.retry)
+ )
th.start()
thread_pool.append(th)
# 等待所有线程结束
@@ -138,22 +144,28 @@ def wrapper(parser, info: MovieInfo, retry):
th: threading.Thread
th.join(timeout=timeout)
# 根据抓取结果更新影片类型判定
- if movie.data_src == 'cid' and movie.dvdid:
+ if movie.data_src == "cid" and movie.dvdid:
titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]]
if any(titles):
movie.dvdid = None
- all_info = {k: v for k, v in all_info.items() if k in Cfg().crawler.selection['cid']}
+ all_info = {
+ k: v for k, v in all_info.items() if k in Cfg().crawler.selection["cid"]
+ }
else:
- logger.debug(f'自动更正影片数据源类型: {movie.dvdid} ({movie.cid}): normal')
- movie.data_src = 'normal'
+ logger.debug(f"自动更正影片数据源类型: {movie.dvdid} ({movie.cid}): normal")
+ movie.data_src = "normal"
movie.cid = None
- all_info = {k: v for k, v in all_info.items() if k not in Cfg().crawler.selection['cid']}
+ all_info = {
+ k: v
+ for k, v in all_info.items()
+ if k not in Cfg().crawler.selection["cid"]
+ }
# 删除抓取失败的站点对应的数据
- all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')}
+ all_info = {k: v for k, v in all_info.items() if hasattr(v, "success")}
for info in all_info.values():
del info.success
# 删除all_info中键名中的'web.'
- all_info = {k[4:]:v for k,v in all_info.items()}
+ all_info = {k[4:]: v for k, v in all_info.items()}
return all_info
@@ -162,8 +174,8 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
final_info = MovieInfo(movie)
########## 部分字段配置了专门的选取逻辑,先处理这些字段 ##########
# genre
- if 'javdb' in all_info and all_info['javdb'].genre:
- final_info.genre = all_info['javdb'].genre
+ if "javdb" in all_info and all_info["javdb"].genre:
+ final_info.genre = all_info["javdb"].genre
########## 移除所有抓取器数据中,标题尾部的女优名 ##########
if Cfg().summarizer.title.remove_trailing_actor_name:
@@ -172,7 +184,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
########## 然后检查所有字段,如果某个字段还是默认值,则按照优先级选取数据 ##########
# parser直接更新了all_info中的项目,而初始all_info是按照优先级生成的,已经符合配置的优先级顺序了
# 按照优先级取出各个爬虫获取到的信息
- attrs = [i for i in dir(final_info) if not i.startswith('_')]
+ attrs = [i for i in dir(final_info) if not i.startswith("_")]
covers, big_covers = [], []
for name, data in all_info.items():
absorbed = []
@@ -180,15 +192,15 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
for attr in attrs:
incoming = getattr(data, attr)
current = getattr(final_info, attr)
- if attr == 'cover':
+ if attr == "cover":
if incoming and (incoming not in covers):
covers.append(incoming)
absorbed.append(attr)
- elif attr == 'big_cover':
+ elif attr == "big_cover":
if incoming and (incoming not in big_covers):
big_covers.append(incoming)
absorbed.append(attr)
- elif attr == 'uncensored':
+ elif attr == "uncensored":
if (current is None) and (incoming is not None):
setattr(final_info, attr, incoming)
absorbed.append(attr)
@@ -197,7 +209,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
setattr(final_info, attr, incoming)
absorbed.append(attr)
if absorbed:
- logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
+ logger.debug(f"从'{name}'中获取了字段: " + " ".join(absorbed))
# 使用网站的番号作为番号
if Cfg().crawler.respect_site_avid:
id_weight = {}
@@ -209,14 +221,19 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
id_weight.setdefault(data.cid, []).append(name)
# 根据权重选择最终番号
if id_weight:
- id_weight = {k:v for k, v in sorted(id_weight.items(), key=lambda x:len(x[1]), reverse=True)}
+ id_weight = {
+ k: v
+ for k, v in sorted(
+ id_weight.items(), key=lambda x: len(x[1]), reverse=True
+ )
+ }
final_id = list(id_weight.keys())[0]
if movie.dvdid:
final_info.dvdid = final_id
else:
final_info.cid = final_id
# javdb封面有水印,优先采用其他站点的封面
- javdb_cover = getattr(all_info.get('javdb'), 'cover', None)
+ javdb_cover = getattr(all_info.get("javdb"), "cover", None)
if javdb_cover is not None:
match Cfg().crawler.use_javdb_cover:
case UseJavDBCover.fallback:
@@ -225,8 +242,8 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
case UseJavDBCover.no:
covers.remove(javdb_cover)
- setattr(final_info, 'covers', covers)
- setattr(final_info, 'big_covers', big_covers)
+ setattr(final_info, "covers", covers)
+ setattr(final_info, "big_covers", big_covers)
# 对cover和big_cover赋值,避免后续检查必须字段时出错
if covers:
final_info.cover = covers[0]
@@ -237,16 +254,17 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
if final_info.genre is None:
final_info.genre = []
if movie.hard_sub:
- final_info.genre.append('内嵌字幕')
+ final_info.genre.append("内嵌字幕")
if movie.uncensored:
- final_info.genre.append('无码流出/破解')
+ final_info.genre.append("无码流出/破解")
# 女优别名固定
if Cfg().crawler.normalize_actress_name and bool(final_info.actress_pics):
final_info.actress = [resolve_alias(i) for i in final_info.actress]
if final_info.actress_pics:
final_info.actress_pics = {
- resolve_alias(key): value for key, value in final_info.actress_pics.items()
+ resolve_alias(key): value
+ for key, value in final_info.actress_pics.items()
}
# 检查是否所有必需的字段都已经获得了值
@@ -258,127 +276,161 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
movie.info = final_info
return True
+
def generate_names(movie: Movie):
"""按照模板生成相关文件的文件名"""
info = movie.info
# 准备用来填充命名模板的字典
d = info.get_info_dic()
if info.actress and len(info.actress) > Cfg().summarizer.path.max_actress_count:
- logging.debug('女优人数过多,按配置保留了其中的前n个: ' + ','.join(info.actress))
- actress = info.actress[:Cfg().summarizer.path.max_actress_count] + ['…']
+ logging.debug(
+ "女优人数过多,按配置保留了其中的前n个: " + ",".join(info.actress)
+ )
+ actress = info.actress[: Cfg().summarizer.path.max_actress_count] + ["…"]
else:
actress = info.actress
- d['actress'] = ','.join(actress) if actress else Cfg().summarizer.default.actress
+ d["actress"] = ",".join(actress) if actress else Cfg().summarizer.default.actress
# 保存label供后面判断裁剪图片的方式使用
- setattr(info, 'label', d['label'].upper())
+ setattr(info, "label", d["label"].upper())
# 处理字段:替换不能作为文件名的字符,移除首尾的空字符
for k, v in d.items():
d[k] = replace_illegal_chars(v.strip())
# 生成nfo文件中的影片标题
nfo_title = Cfg().summarizer.nfo.title_pattern.format(**d)
- setattr(info, 'nfo_title', nfo_title)
-
+ setattr(info, "nfo_title", nfo_title)
+
# 使用字典填充模板,生成相关文件的路径(多分片影片要考虑CD-x部分)
- cdx = '' if len(movie.files) <= 1 else '-CD1'
- if hasattr(info, 'title_break'):
+ cdx = "" if len(movie.files) <= 1 else "-CD1"
+ if hasattr(info, "title_break"):
title_break = info.title_break
else:
- title_break = split_by_punc(d['title'])
- if hasattr(info, 'ori_title_break'):
+ title_break = split_by_punc(d["title"])
+ if hasattr(info, "ori_title_break"):
ori_title_break = info.ori_title_break
else:
- ori_title_break = split_by_punc(d['rawtitle'])
+ ori_title_break = split_by_punc(d["rawtitle"])
copyd = d.copy()
- copyd['num'] = copyd['num'] + movie.attr_str
+ copyd["num"] = copyd["num"] + movie.attr_str
longest_ext = max((os.path.splitext(i)[1] for i in movie.files), key=len)
for end in range(len(ori_title_break), 0, -1):
- copyd['rawtitle'] = replace_illegal_chars(''.join(ori_title_break[:end]).strip())
+ copyd["rawtitle"] = replace_illegal_chars(
+ "".join(ori_title_break[:end]).strip()
+ )
for sub_end in range(len(title_break), 0, -1):
- copyd['title'] = replace_illegal_chars(''.join(title_break[:sub_end]).strip())
+ copyd["title"] = replace_illegal_chars(
+ "".join(title_break[:sub_end]).strip()
+ )
if Cfg().summarizer.move_files:
- save_dir = os.path.normpath(Cfg().summarizer.path.output_folder_pattern.format(**copyd)).strip()
- basename = os.path.normpath(Cfg().summarizer.path.basename_pattern.format(**copyd)).strip()
+ save_dir = os.path.normpath(
+ Cfg().summarizer.path.output_folder_pattern.format(**copyd)
+ ).strip()
+ basename = os.path.normpath(
+ Cfg().summarizer.path.basename_pattern.format(**copyd)
+ ).strip()
else:
# 如果不整理文件,则保存抓取的数据到当前目录
save_dir = os.path.dirname(movie.files[0])
filebasename = os.path.basename(movie.files[0])
ext = os.path.splitext(filebasename)[1]
- basename = filebasename.replace(ext, '')
- long_path = os.path.join(save_dir, basename+longest_ext)
+ basename = filebasename.replace(ext, "")
+ long_path = os.path.join(save_dir, basename + longest_ext)
remaining = get_remaining_path_len(os.path.abspath(long_path))
if remaining > 0:
movie.save_dir = save_dir
movie.basename = basename
- movie.nfo_file = os.path.join(save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + '.nfo')
- movie.fanart_file = os.path.join(save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + '.jpg')
- movie.poster_file = os.path.join(save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + '.jpg')
- if d['title'] != copyd['title']:
+ movie.nfo_file = os.path.join(
+ save_dir,
+ Cfg().summarizer.nfo.basename_pattern.format(**copyd) + ".nfo",
+ )
+ movie.fanart_file = os.path.join(
+ save_dir,
+ Cfg().summarizer.fanart.basename_pattern.format(**copyd) + ".jpg",
+ )
+ movie.poster_file = os.path.join(
+ save_dir,
+ Cfg().summarizer.cover.basename_pattern.format(**copyd) + ".jpg",
+ )
+ if d["title"] != copyd["title"]:
logger.info(f"自动截短标题为:\n{copyd['title']}")
- if d['rawtitle'] != copyd['rawtitle']:
+ if d["rawtitle"] != copyd["rawtitle"]:
logger.info(f"自动截短原始标题为:\n{copyd['rawtitle']}")
return
else:
# 以防万一,当整理路径非常深或者标题起始很长一段没有标点符号时,硬性截短生成的名称
- copyd['title'] = copyd['title'][:remaining]
- copyd['rawtitle'] = copyd['rawtitle'][:remaining]
+ copyd["title"] = copyd["title"][:remaining]
+ copyd["rawtitle"] = copyd["rawtitle"][:remaining]
# 如果不整理文件,则保存抓取的数据到当前目录
if not Cfg().summarizer.move_files:
save_dir = os.path.dirname(movie.files[0])
filebasename = os.path.basename(movie.files[0])
ext = os.path.splitext(filebasename)[1]
- basename = filebasename.replace(ext, '')
+ basename = filebasename.replace(ext, "")
else:
- save_dir = os.path.normpath(Cfg().summarizer.path.output_folder_pattern.format(**copyd)).strip()
- basename = os.path.normpath(Cfg().summarizer.path.basename_pattern.format(**copyd)).strip()
+ save_dir = os.path.normpath(
+ Cfg().summarizer.path.output_folder_pattern.format(**copyd)
+ ).strip()
+ basename = os.path.normpath(
+ Cfg().summarizer.path.basename_pattern.format(**copyd)
+ ).strip()
movie.save_dir = save_dir
movie.basename = basename
- movie.nfo_file = os.path.join(save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + '.nfo')
- movie.fanart_file = os.path.join(save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + '.jpg')
- movie.poster_file = os.path.join(save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + '.jpg')
-
- if d['title'] != copyd['title']:
+ movie.nfo_file = os.path.join(
+ save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + ".nfo"
+ )
+ movie.fanart_file = os.path.join(
+ save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + ".jpg"
+ )
+ movie.poster_file = os.path.join(
+ save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + ".jpg"
+ )
+
+ if d["title"] != copyd["title"]:
logger.info(f"自动截短标题为:\n{copyd['title']}")
- if d['rawtitle'] != copyd['rawtitle']:
+ if d["rawtitle"] != copyd["rawtitle"]:
logger.info(f"自动截短原始标题为:\n{copyd['rawtitle']}")
+
def reviewMovieID(all_movies, root):
"""人工检查每一部影片的番号"""
count = len(all_movies)
- logger.info('进入手动模式检查番号: ')
+ logger.info("进入手动模式检查番号: ")
for i, movie in enumerate(all_movies, start=1):
id = repr(movie)[7:-2]
- print(f'[{i}/{count}]\t{Fore.LIGHTMAGENTA_EX}{id}{Style.RESET_ALL}, 对应文件:')
+ print(f"[{i}/{count}]\t{Fore.LIGHTMAGENTA_EX}{id}{Style.RESET_ALL}, 对应文件:")
relpaths = [os.path.relpath(i, root) for i in movie.files]
- print('\n'.join([' '+i for i in relpaths]))
- s = input("回车确认当前番号,或直接输入更正后的番号(如'ABC-123'或'cid:sqte00300')")
+ print("\n".join([" " + i for i in relpaths]))
+ s = input(
+ "回车确认当前番号,或直接输入更正后的番号(如'ABC-123'或'cid:sqte00300')"
+ )
if not s:
logger.info(f"已确认影片番号: {','.join(relpaths)}: {id}")
else:
s = s.strip()
s_lc = s.lower()
- if s_lc.startswith(('cid:', 'cid=')):
+ if s_lc.startswith(("cid:", "cid=")):
new_movie = Movie(cid=s_lc[4:])
- new_movie.data_src = 'cid'
+ new_movie.data_src = "cid"
new_movie.files = movie.files
- elif s_lc.startswith('fc2'):
+ elif s_lc.startswith("fc2"):
new_movie = Movie(s)
- new_movie.data_src = 'fc2'
+ new_movie.data_src = "fc2"
new_movie.files = movie.files
else:
new_movie = Movie(s)
- new_movie.data_src = 'normal'
+ new_movie.data_src = "normal"
new_movie.files = movie.files
- all_movies[i-1] = new_movie
+ all_movies[i - 1] = new_movie
new_id = repr(new_movie)[7:-2]
logger.info(f"已更正影片番号: {','.join(relpaths)}: {id} -> {new_id}")
print()
-SUBTITLE_MARK_FILE = Image.open(os.path.abspath(resource_path('image/sub_mark.png')))
-UNCENSORED_MARK_FILE = Image.open(os.path.abspath(resource_path('image/unc_mark.png')))
+SUBTITLE_MARK_FILE = Image.open(os.path.abspath(resource_path("image/sub_mark.png")))
+UNCENSORED_MARK_FILE = Image.open(os.path.abspath(resource_path("image/unc_mark.png")))
+
def process_poster(movie: Movie):
def should_use_ai_crop_match(label):
@@ -386,10 +438,13 @@ def should_use_ai_crop_match(label):
re.match(r, label)
return True
return False
+
crop_engine = None
- if (movie.info.uncensored or
- movie.data_src == 'fc2' or
- should_use_ai_crop_match(movie.info.label.upper())):
+ if (
+ movie.info.uncensored
+ or movie.data_src == "fc2"
+ or should_use_ai_crop_match(movie.info.label.upper())
+ ):
crop_engine = Cfg().summarizer.cover.crop.engine
cropper = get_cropper(crop_engine)
fanart_image = Image.open(movie.fanart_file)
@@ -397,21 +452,27 @@ def should_use_ai_crop_match(label):
if Cfg().summarizer.cover.add_label:
if movie.hard_sub:
- fanart_cropped = add_label_to_poster(fanart_cropped, SUBTITLE_MARK_FILE, LabelPostion.BOTTOM_RIGHT)
+ fanart_cropped = add_label_to_poster(
+ fanart_cropped, SUBTITLE_MARK_FILE, LabelPostion.BOTTOM_RIGHT
+ )
if movie.uncensored:
- fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT)
+ fanart_cropped = add_label_to_poster(
+ fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT
+ )
fanart_cropped.save(movie.poster_file)
+
def RunNormalMode(all_movies):
"""普通整理模式"""
- def check_step(result, msg='步骤错误'):
+
+ def check_step(result, msg="步骤错误"):
"""检查一个整理步骤的结果,并负责更新tqdm的进度"""
if result:
inner_bar.update()
else:
- raise Exception(msg + '\n')
+ raise Exception(msg + "\n")
- outer_bar = tqdm(all_movies, desc='整理影片', ascii=True, leave=False)
+ outer_bar = tqdm(all_movies, desc="整理影片", ascii=True, leave=False)
total_step = 6
if Cfg().translator.engine:
total_step += 1
@@ -423,34 +484,36 @@ def check_step(result, msg='步骤错误'):
try:
# 初始化本次循环要整理影片任务
filenames = [os.path.split(i)[1] for i in movie.files]
- logger.info('正在整理: ' + ', '.join(filenames))
- inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False)
+ logger.info("正在整理: " + ", ".join(filenames))
+ inner_bar = tqdm(total=total_step, desc="步骤", ascii=True, leave=False)
# 依次执行各个步骤
- inner_bar.set_description(f'启动并发任务')
+ inner_bar.set_description("启动并发任务")
all_info = parallel_crawler(movie, inner_bar)
- msg = f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息'
+ msg = f"为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息"
check_step(all_info, msg)
- inner_bar.set_description('汇总数据')
+ inner_bar.set_description("汇总数据")
has_required_keys = info_summary(movie, all_info)
check_step(has_required_keys)
if Cfg().translator.engine:
- inner_bar.set_description('翻译影片信息')
+ inner_bar.set_description("翻译影片信息")
success = translate_movie_info(movie.info)
check_step(success)
generate_names(movie)
- check_step(movie.save_dir, '无法按命名规则生成目标文件夹')
+ check_step(movie.save_dir, "无法按命名规则生成目标文件夹")
if not os.path.exists(movie.save_dir):
os.makedirs(movie.save_dir)
- inner_bar.set_description('下载封面图片')
+ inner_bar.set_description("下载封面图片")
if Cfg().summarizer.cover.highres:
- cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers)
+ cover_dl = download_cover(
+ movie.info.covers, movie.fanart_file, movie.info.big_covers
+ )
else:
cover_dl = download_cover(movie.info.covers, movie.fanart_file)
- check_step(cover_dl, '下载封面图片失败')
+ check_step(cover_dl, "下载封面图片失败")
cover, pic_path = cover_dl
# 确保实际下载的封面的url与即将写入到movie.info中的一致
if cover != movie.info.cover:
@@ -466,23 +529,31 @@ def check_step(result, msg='步骤错误'):
check_step(True)
if Cfg().summarizer.extra_fanarts.enabled:
- scrape_interval = Cfg().summarizer.extra_fanarts.scrap_interval.total_seconds()
- inner_bar.set_description('下载剧照')
+ scrape_interval = (
+ Cfg().summarizer.extra_fanarts.scrap_interval.total_seconds()
+ )
+ inner_bar.set_description("下载剧照")
if movie.info.preview_pics:
- extrafanartdir = movie.save_dir + '/extrafanart'
+ extrafanartdir = movie.save_dir + "/extrafanart"
os.mkdir(extrafanartdir)
- for (id, pic_url) in enumerate(movie.info.preview_pics):
- inner_bar.set_description(f"Downloading extrafanart {id} from url: {pic_url}")
-
+ for id, pic_url in enumerate(movie.info.preview_pics):
+ inner_bar.set_description(
+ f"Downloading extrafanart {id} from url: {pic_url}"
+ )
+
fanart_destination = f"{extrafanartdir}/{id}.png"
try:
info = download(pic_url, fanart_destination)
if valid_pic(fanart_destination):
filesize = get_fmt_size(pic_path)
width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
- logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]")
+ elapsed = time.strftime(
+ "%M:%S", time.gmtime(info["elapsed"])
+ )
+ speed = get_fmt_size(info["rate"]) + "/s"
+ logger.info(
+ f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]"
+ )
else:
check_step(False, f"下载剧照{id}: {pic_url}失败")
except:
@@ -490,18 +561,20 @@ def check_step(result, msg='步骤错误'):
time.sleep(scrape_interval)
check_step(True)
- inner_bar.set_description('写入NFO')
+ inner_bar.set_description("写入NFO")
write_nfo(movie.info, movie.nfo_file)
check_step(True)
if Cfg().summarizer.move_files:
- inner_bar.set_description('移动影片文件')
+ inner_bar.set_description("移动影片文件")
movie.rename_files(Cfg().summarizer.path.hard_link)
check_step(True)
- logger.info(f'整理完成,相关文件已保存到: {movie.save_dir}\n')
+ logger.info(f"整理完成,相关文件已保存到: {movie.save_dir}\n")
else:
- logger.info(f'刮削完成,相关文件已保存到: {movie.nfo_file}\n')
+ logger.info(f"刮削完成,相关文件已保存到: {movie.nfo_file}\n")
- if movie != all_movies[-1] and Cfg().crawler.sleep_after_scraping > Duration(0):
+ if (
+ movie != all_movies[-1]
+ and Cfg().crawler.sleep_after_scraping > Duration(0)
+ ):
time.sleep(Cfg().crawler.sleep_after_scraping.total_seconds())
return_movies.append(movie)
# except Exception as e:
@@ -523,9 +596,11 @@ def download_cover(covers, fanart_path, big_covers=[]):
if valid_pic(pic_path):
filesize = get_fmt_size(pic_path)
width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
- logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]")
+ elapsed = time.strftime("%M:%S", time.gmtime(info["elapsed"]))
+ speed = get_fmt_size(info["rate"]) + "/s"
+ logger.info(
+ f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]"
+ )
return (url, pic_path)
except requests.exceptions.HTTPError:
# HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试
@@ -544,20 +619,22 @@ def download_cover(covers, fanart_path, big_covers=[]):
break
except Exception as e:
logger.debug(e, exc_info=True)
- logger.error(f"下载封面图片失败")
- logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers))
+ logger.error("下载封面图片失败")
+ logger.debug("big_covers:" + str(big_covers) + ", covers" + str(covers))
return None
+
def get_pic_path(fanart_path, url):
fanart_base = os.path.splitext(fanart_path)[0]
- pic_extend = url.split('.')[-1]
+ pic_extend = url.split(".")[-1]
# 判断 url 是否带?后面的参数
- if '?' in pic_extend:
- pic_extend = pic_extend.split('?')[0]
-
+ if "?" in pic_extend:
+ pic_extend = pic_extend.split("?")[0]
+
pic_path = fanart_base + "." + pic_extend
return pic_path
+
def error_exit(success, err_info):
"""检查业务逻辑是否成功完成,如果失败则报错退出程序"""
if not success:
@@ -581,24 +658,25 @@ def entry():
colorama.init(autoreset=True)
# 检查更新
- version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行')
- logger.debug(version_info.center(60, '='))
+ version_info = "JavSP " + getattr(sys, "javsp_version", "未知版本/从代码运行")
+ logger.debug(version_info.center(60, "="))
check_update(Cfg().other.check_update, Cfg().other.auto_update)
root = get_scan_dir(Cfg().scanner.input_directory)
- error_exit(root, '未选择要扫描的文件夹')
+ error_exit(root, "未选择要扫描的文件夹")
# 导入抓取器,必须在chdir之前
import_crawlers()
os.chdir(root)
- print(f'扫描影片文件...')
+ print("扫描影片文件...")
recognized = scan_movies(root)
movie_count = len(recognized)
recognize_fail = []
- error_exit(movie_count, '未找到影片文件')
- logger.info(f'扫描影片文件:共找到 {movie_count} 部影片')
+ error_exit(movie_count, "未找到影片文件")
+ logger.info(f"扫描影片文件:共找到 {movie_count} 部影片")
RunNormalMode(recognized + recognize_fail)
sys.exit(0)
+
if __name__ == "__main__":
entry()
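
Note on the parallel_crawler hunks above: the function spawns one thread per configured crawler module, wraps each parser in a retry loop, joins the threads, and keeps only the sites whose parser set the `success` flag. A minimal standalone sketch of that fan-out/join/filter shape, with toy parsers standing in for the javsp.web modules (all names here are illustrative, not part of the codebase):

    import threading

    def fan_out(parsers, retry=3, timeout=10.0):
        results = {name: {} for name in parsers}

        def wrapper(parser, info):
            for _ in range(retry):
                try:
                    parser(info)            # parser fills `info` in place
                    info["success"] = True
                    return
                except Exception:
                    continue                # e.g. a transient network error

        threads = [
            threading.Thread(target=wrapper, name=name, args=(parser, results[name]))
            for name, parser in parsers.items()
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=timeout)
        # drop sites whose parser never succeeded, mirroring the
        # hasattr(v, "success") filter at the end of parallel_crawler
        return {n: d for n, d in results.items() if d.pop("success", False)}

    def ok(info): info["title"] = "demo"
    def bad(info): raise RuntimeError("site down")

    print(fan_out({"siteA": ok, "siteB": bad}))   # {'siteA': {'title': 'demo'}}
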
diff --git a/javsp/avid.py b/javsp/avid.py
index f535f1fee..2e22b8b2b 100644
--- a/javsp/avid.py
+++ b/javsp/avid.py
@@ -1,149 +1,159 @@
"""获取和转换影片的各类番号(DVD ID, DMM cid, DMM pid)"""
+
import os
import re
from pathlib import Path
-__all__ = ['get_id', 'get_cid', 'guess_av_type']
+__all__ = ["get_id", "get_cid", "guess_av_type"]
from javsp.config import Cfg
+
def get_id(filepath_str: str) -> str:
"""从给定的文件路径中提取番号(DVD ID)"""
filepath = Path(filepath_str)
# 通常是接收文件的路径,当然如果是普通字符串也可以
- ignore_pattern = re.compile('|'.join(Cfg().scanner.ignored_id_pattern))
- norm = ignore_pattern.sub('', filepath.stem).upper()
- if 'FC2' in norm:
+ ignore_pattern = re.compile("|".join(Cfg().scanner.ignored_id_pattern))
+ norm = ignore_pattern.sub("", filepath.stem).upper()
+ if "FC2" in norm:
# 根据FC2 Club的影片数据,FC2编号为5-7个数字
- match = re.search(r'FC2[^A-Z\d]{0,5}(PPV[^A-Z\d]{0,5})?(\d{5,7})', norm, re.I)
+ match = re.search(r"FC2[^A-Z\d]{0,5}(PPV[^A-Z\d]{0,5})?(\d{5,7})", norm, re.I)
if match:
- return 'FC2-' + match.group(2)
- elif 'HEYDOUGA' in norm:
- match = re.search(r'(HEYDOUGA)[-_]*(\d{4})[-_]0?(\d{3,5})', norm, re.I)
+ return "FC2-" + match.group(2)
+ elif "HEYDOUGA" in norm:
+ match = re.search(r"(HEYDOUGA)[-_]*(\d{4})[-_]0?(\d{3,5})", norm, re.I)
if match:
- return '-'.join(match.groups())
- elif 'GETCHU' in norm:
- match = re.search(r'GETCHU[-_]*(\d+)', norm, re.I)
+ return "-".join(match.groups())
+ elif "GETCHU" in norm:
+ match = re.search(r"GETCHU[-_]*(\d+)", norm, re.I)
if match:
- return 'GETCHU-' + match.group(1)
- elif 'GYUTTO' in norm:
- match = re.search(r'GYUTTO-(\d+)', norm, re.I)
+ return "GETCHU-" + match.group(1)
+ elif "GYUTTO" in norm:
+ match = re.search(r"GYUTTO-(\d+)", norm, re.I)
if match:
- return 'GYUTTO-' + match.group(1)
- elif '259LUXU' in norm: # special case having form of '259luxu'
- match = re.search(r'259LUXU-(\d+)', norm, re.I)
+ return "GYUTTO-" + match.group(1)
+ elif "259LUXU" in norm: # special case having form of '259luxu'
+ match = re.search(r"259LUXU-(\d+)", norm, re.I)
if match:
- return '259LUXU-' + match.group(1)
+ return "259LUXU-" + match.group(1)
else:
# 先尝试移除可疑域名进行匹配,如果匹配不到再使用原始文件名进行匹配
- no_domain = re.sub(r'\w{3,10}\.(COM|NET|APP|XYZ)', '', norm, flags=re.I)
+ no_domain = re.sub(r"\w{3,10}\.(COM|NET|APP|XYZ)", "", norm, flags=re.I)
if no_domain != norm:
avid = get_id(no_domain)
if avid:
return avid
# 匹配缩写成hey的heydouga影片。由于番号分三部分,要先于后面分两部分的进行匹配
- match = re.search(r'(?:HEY)[-_]*(\d{4})[-_]0?(\d{3,5})', norm, re.I)
+ match = re.search(r"(?:HEY)[-_]*(\d{4})[-_]0?(\d{3,5})", norm, re.I)
if match:
- return 'heydouga-' + '-'.join(match.groups())
+ return "heydouga-" + "-".join(match.groups())
# 匹配片商 MUGEN 的奇怪番号。由于MK3D2DBD的模式,要放在普通番号模式之前进行匹配
- match = re.search(r'(MKB?D)[-_]*(S\d{2,3})|(MK3D2DBD|S2M|S2MBD)[-_]*(\d{2,3})', norm, re.I)
+ match = re.search(
+ r"(MKB?D)[-_]*(S\d{2,3})|(MK3D2DBD|S2M|S2MBD)[-_]*(\d{2,3})", norm, re.I
+ )
if match:
if match.group(1) is not None:
- avid = match.group(1) + '-' + match.group(2)
+ avid = match.group(1) + "-" + match.group(2)
else:
- avid = match.group(3) + '-' + match.group(4)
+ avid = match.group(3) + "-" + match.group(4)
return avid
# 匹配IBW这样带有后缀z的番号
- match = re.search(r'(IBW)[-_](\d{2,5}z)', norm, re.I)
+ match = re.search(r"(IBW)[-_](\d{2,5}z)", norm, re.I)
if match:
- return match.group(1) + '-' + match.group(2)
+ return match.group(1) + "-" + match.group(2)
# 普通番号,优先尝试匹配带分隔符的(如ABC-123)
- match = re.search(r'([A-Z]{2,10})[-_](\d{2,5})', norm, re.I)
+ match = re.search(r"([A-Z]{2,10})[-_](\d{2,5})", norm, re.I)
if match:
- return match.group(1) + '-' + match.group(2)
+ return match.group(1) + "-" + match.group(2)
# 普通番号,运行到这里时表明无法匹配到带分隔符的番号
# 先尝试匹配东热的red, sky, ex三个不带-分隔符的系列
# (这三个系列已停止更新,因此根据其作品编号将数字范围限制得小一些以降低误匹配概率)
- match = re.search(r'(RED[01]\d\d|SKY[0-3]\d\d|EX00[01]\d)', norm, re.I)
+ match = re.search(r"(RED[01]\d\d|SKY[0-3]\d\d|EX00[01]\d)", norm, re.I)
if match:
return match.group(1)
# 然后再将影片视作缺失了-分隔符来匹配
- match = re.search(r'([A-Z]{2,})(\d{2,5})', norm, re.I)
+ match = re.search(r"([A-Z]{2,})(\d{2,5})", norm, re.I)
if match:
- return match.group(1) + '-' + match.group(2)
+ return match.group(1) + "-" + match.group(2)
# 尝试匹配TMA制作的影片(如'T28-557',他家的番号很乱)
- match = re.search(r'(T[23]8[-_]\d{3})', norm)
+ match = re.search(r"(T[23]8[-_]\d{3})", norm)
if match:
return match.group(1)
# 尝试匹配东热n, k系列
- match = re.search(r'(N\d{4}|K\d{4})', norm, re.I)
+ match = re.search(r"(N\d{4}|K\d{4})", norm, re.I)
if match:
return match.group(1)
# 尝试匹配纯数字番号(无码影片)
- match = re.search(r'(\d{6}[-_]\d{2,3})', norm)
+ match = re.search(r"(\d{6}[-_]\d{2,3})", norm)
if match:
return match.group(1)
# 如果还是匹配不了,尝试将')('替换为'-'后再试,少部分影片的番号是由')('分隔的
- if ')(' in norm:
- avid = get_id(norm.replace(')(', '-'))
+ if ")(" in norm:
+ avid = get_id(norm.replace(")(", "-"))
if avid:
return avid
# 如果最后仍然匹配不了番号,则尝试使用文件所在文件夹的名字去匹配
-
- if filepath.parent.name != '': # haven't reach '.' or '/'
+
+ if filepath.parent.name != "": # haven't reached '.' or '/'
return get_id(filepath.parent.name)
else:
- return ''
+ return ""
+
+
+CD_POSTFIX = re.compile(r"([-_]\w|cd\d)$")
-CD_POSTFIX = re.compile(r'([-_]\w|cd\d)$')
def get_cid(filepath: str) -> str:
"""尝试将给定的文件名匹配为CID(Content ID)"""
basename = os.path.splitext(os.path.basename(filepath))[0]
# 移除末尾可能带有的分段影片序号
- possible = CD_POSTFIX.sub('', basename)
+ possible = CD_POSTFIX.sub("", basename)
# cid只由数字、小写字母和下划线组成
- match = re.match(r'^([a-z\d_]+)$', possible, re.A)
+ match = re.match(r"^([a-z\d_]+)$", possible, re.A)
if match:
possible = match.group(1)
- if '_' not in possible:
+ if "_" not in possible:
# 长度为7-14的cid就占了约99.01%. 最长的cid为24,但是长为20-24的比例不到十万分之五
- match = re.match(r'^[a-z\d]{7,19}$', possible)
+ match = re.match(r"^[a-z\d]{7,19}$", possible)
if match:
return possible
else:
# 绝大多数都只有一个下划线(只有约万分之一带有两个下划线)
- match2 = re.match(r'''^h_\d{3,4}[a-z]{1,10}\d{2,5}[a-z\d]{0,8}$ # 约 99.17%
+ match2 = re.match(
+ r"""^h_\d{3,4}[a-z]{1,10}\d{2,5}[a-z\d]{0,8}$ # 约 99.17%
|^\d{3}_\d{4,5}$ # 约 0.57%
|^402[a-z]{3,6}\d*_[a-z]{3,8}\d{5,6}$ # 约 0.09%
|^h_\d{3,4}wvr\d\w\d{4,5}[a-z\d]{0,8}$ # 约 0.06%
- $''', possible, re.VERBOSE)
+ $""",
+ possible,
+ re.VERBOSE,
+ )
if match2:
return possible
- return ''
+ return ""
def guess_av_type(avid: str) -> str:
"""识别给定的番号所属的分类: normal, fc2, cid"""
- match = re.match(r'^FC2-\d{5,7}$', avid, re.I)
+ match = re.match(r"^FC2-\d{5,7}$", avid, re.I)
if match:
- return 'fc2'
- match = re.match(r'^GETCHU-(\d+)',avid,re.I)
+ return "fc2"
+ match = re.match(r"^GETCHU-(\d+)", avid, re.I)
if match:
- return 'getchu'
- match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
+ return "getchu"
+ match = re.match(r"^GYUTTO-(\d+)", avid, re.I)
if match:
- return 'gyutto'
+ return "gyutto"
# 如果传入的avid完全匹配cid的模式,则将影片归类为cid
cid = get_cid(avid)
if cid == avid:
- return 'cid'
+ return "cid"
# 以上都不是: 默认归类为normal
- return 'normal'
+ return "normal"
if __name__ == "__main__":
- print(get_id('FC2-123456/Unknown.mp4'))
+ print(get_id("FC2-123456/Unknown.mp4"))
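
The get_id rewrite above preserves its structure: an ordered cascade of regexes, from producer-specific formats (FC2, HEYDOUGA, GETCHU, ...) down to the generic letters-separator-digits pattern, falling back to the parent directory name when the filename yields nothing. A trimmed sketch of that cascade with just two of the patterns (extract_avid is a hypothetical name, not the project's):

    import re
    from pathlib import Path

    def extract_avid(path_str: str) -> str:
        stem = Path(path_str).stem.upper()
        if "FC2" in stem:
            m = re.search(r"FC2[^A-Z\d]{0,5}(PPV[^A-Z\d]{0,5})?(\d{5,7})", stem, re.I)
            if m:
                return "FC2-" + m.group(2)
        m = re.search(r"([A-Z]{2,10})[-_](\d{2,5})", stem, re.I)  # generic ABC-123
        if m:
            return m.group(1) + "-" + m.group(2)
        parent = Path(path_str).parent.name
        # recurse on the containing folder name, as the real get_id does
        return extract_avid(parent) if parent else ""

    print(extract_avid("FC2-123456/Unknown.mp4"))   # FC2-123456
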
diff --git a/javsp/chromium.py b/javsp/chromium.py
index db315293e..b7d5a5ef4 100644
--- a/javsp/chromium.py
+++ b/javsp/chromium.py
@@ -1,4 +1,5 @@
"""解析Chromium系浏览器Cookies的相关函数"""
+
import os
import sys
import json
@@ -9,7 +10,7 @@
from shutil import copyfile
from datetime import datetime
-__all__ = ['get_browsers_cookies']
+__all__ = ["get_browsers_cookies"]
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
@@ -18,15 +19,16 @@
logger = logging.getLogger(__name__)
-class Decrypter():
+class Decrypter:
def __init__(self, key):
self.key = key
+
def decrypt(self, encrypted_value):
- nonce = encrypted_value[3:3+12]
- ciphertext = encrypted_value[3+12:-16]
+ nonce = encrypted_value[3 : 3 + 12]
+ ciphertext = encrypted_value[3 + 12 : -16]
tag = encrypted_value[-16:]
cipher = AES.new(self.key, AES.MODE_GCM, nonce=nonce)
- plaintext = cipher.decrypt_and_verify(ciphertext, tag).decode('utf-8')
+ plaintext = cipher.decrypt_and_verify(ciphertext, tag).decode("utf-8")
return plaintext
@@ -34,32 +36,38 @@ def get_browsers_cookies():
"""获取系统上的所有Chromium系浏览器的JavDB的Cookies"""
# 不予支持: Opera, 360安全&极速, 搜狗使用非标的用户目录或数据格式; QQ浏览器屏蔽站点
user_data_dirs = {
- 'Chrome': '/Google/Chrome/User Data',
- 'Chrome Beta': '/Google/Chrome Beta/User Data',
- 'Chrome Canary': '/Google/Chrome SxS/User Data',
- 'Chromium': '/Google/Chromium/User Data',
- 'Edge': '/Microsoft/Edge/User Data',
- 'Vivaldi': '/Vivaldi/User Data'
+ "Chrome": "/Google/Chrome/User Data",
+ "Chrome Beta": "/Google/Chrome Beta/User Data",
+ "Chrome Canary": "/Google/Chrome SxS/User Data",
+ "Chromium": "/Google/Chromium/User Data",
+ "Edge": "/Microsoft/Edge/User Data",
+ "Vivaldi": "/Vivaldi/User Data",
}
- LocalAppDataDir = os.getenv('LOCALAPPDATA')
+ LocalAppDataDir = os.getenv("LOCALAPPDATA")
all_browser_cookies = []
exceptions = []
for brw, path in user_data_dirs.items():
user_dir = LocalAppDataDir + path
- cookies_files = glob(user_dir+'/*/Cookies') + glob(user_dir+'/*/Network/Cookies')
- local_state = user_dir+'/Local State'
+ cookies_files = glob(user_dir + "/*/Cookies") + glob(
+ user_dir + "/*/Network/Cookies"
+ )
+ local_state = user_dir + "/Local State"
if os.path.exists(local_state):
key = decrypt_key(local_state)
decrypter = Decrypter(key)
for file in cookies_files:
- profile = brw + ": " + file.split('User Data')[1].split(os.sep)[1]
+ profile = brw + ": " + file.split("User Data")[1].split(os.sep)[1]
file = os.path.normpath(file)
try:
records = get_cookies(file, decrypter)
if records:
# 将records转换为便于使用的格式
for site, cookies in records.items():
- entry = {'profile': profile, 'site': site, 'cookies': cookies}
+ entry = {
+ "profile": profile,
+ "site": site,
+ "cookies": cookies,
+ }
all_browser_cookies.append(entry)
except Exception as e:
exceptions.append(e)
@@ -78,45 +86,51 @@ def convert_chrome_utc(chrome_utc):
unix_utc = datetime.fromtimestamp(second)
return unix_utc
+
def decrypt_key_win(local_state):
"""从Local State文件中提取并解密出Cookies文件的密钥"""
# Chrome 80+ 的Cookies解密方法参考自: https://stackoverflow.com/a/60423699/6415337
import win32crypt
- with open(local_state, 'rt', encoding='utf-8') as file:
- encrypted_key = json.loads(file.read())['os_crypt']['encrypted_key']
- encrypted_key = base64.b64decode(encrypted_key) # Base64 decoding
- encrypted_key = encrypted_key[5:] # Remove DPAPI
- decrypted_key = win32crypt.CryptUnprotectData(encrypted_key, None, None, None, 0)[1] # Decrypt key
+
+ with open(local_state, "rt", encoding="utf-8") as file:
+ encrypted_key = json.loads(file.read())["os_crypt"]["encrypted_key"]
+ encrypted_key = base64.b64decode(encrypted_key) # Base64 decoding
+ encrypted_key = encrypted_key[5:] # Remove DPAPI
+ decrypted_key = win32crypt.CryptUnprotectData(encrypted_key, None, None, None, 0)[
+ 1
+ ] # Decrypt key
return decrypted_key
def decrypt_key_linux(local_state):
"""从Local State文件中提取并解密出Cookies文件的密钥,适用于Linux"""
# 读取Local State文件中的密钥
- with open(local_state, 'rt', encoding='utf-8') as file:
- encrypted_key = json.loads(file.read())['os_crypt']['encrypted_key']
+ with open(local_state, "rt", encoding="utf-8") as file:
+ encrypted_key = json.loads(file.read())["os_crypt"]["encrypted_key"]
encrypted_key = base64.b64decode(encrypted_key)
encrypted_key = encrypted_key[5:]
key = encrypted_key
- nonce = b' ' * 12
+ nonce = b" " * 12
aesgcm = AESGCM(key)
decrypted_key = aesgcm.decrypt(nonce, encrypted_key, None)
return decrypted_key
-decrypt_key = decrypt_key_win if sys.platform == 'win32' else decrypt_key_linux
+decrypt_key = decrypt_key_win if sys.platform == "win32" else decrypt_key_linux
-def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'):
+def get_cookies(cookies_file, decrypter, host_pattern="javdb%.com"):
"""从cookies_file文件中查找指定站点的所有Cookies"""
# 复制Cookies文件到临时目录,避免直接操作原始的Cookies文件
- temp_dir = os.getenv('TMPDIR', os.getenv('TEMP', os.getenv('TMP', '.')))
- temp_cookie = os.path.join(temp_dir, 'Cookies')
+ temp_dir = os.getenv("TMPDIR", os.getenv("TEMP", os.getenv("TMP", ".")))
+ temp_cookie = os.path.join(temp_dir, "Cookies")
copyfile(cookies_file, temp_cookie)
# 连接数据库进行查询
conn = sqlite3.connect(temp_cookie)
cursor = conn.cursor()
- cursor.execute(f'SELECT host_key, name, encrypted_value, expires_utc FROM cookies WHERE host_key LIKE "{host_pattern}"')
+ cursor.execute(
+ f'SELECT host_key, name, encrypted_value, expires_utc FROM cookies WHERE host_key LIKE "{host_pattern}"'
+ )
# 将查询结果按照host_key进行组织
now = datetime.now()
records = {}
@@ -127,7 +141,7 @@ def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'):
if expires > now:
d[name] = decrypter.decrypt(encrypted_value)
# Cookies的核心字段是'_jdb_session',因此如果records中缺失此字段(说明已过期),则对应的Cookies不再有效
- valid_records = {k: v for k, v in records.items() if '_jdb_session' in v}
+ valid_records = {k: v for k, v in records.items() if "_jdb_session" in v}
conn.close()
os.remove(temp_cookie)
return valid_records
@@ -136,5 +150,4 @@ def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'):
if __name__ == "__main__":
all_cookies = get_browsers_cookies()
for d in all_cookies:
- print('{:<20}{}'.format(d['profile'], d['site']))
-
+ print("{:<20}{}".format(d["profile"], d["site"]))
diff --git a/javsp/config.py b/javsp/config.py
index 3fbc8f071..a3884b394 100644
--- a/javsp/config.py
+++ b/javsp/config.py
@@ -9,6 +9,7 @@
from javsp.lib import resource_path
+
class Scanner(BaseConfig):
ignored_id_pattern: List[str]
input_directory: Path | None = None
@@ -16,26 +17,28 @@ class Scanner(BaseConfig):
ignored_folder_name_pattern: List[str]
minimum_size: ByteSize
+
class CrawlerID(str, Enum):
- airav = 'airav'
- avsox = 'avsox'
- avwiki = 'avwiki'
- dl_getchu = 'dl_getchu'
- fanza = 'fanza'
- fc2 = 'fc2'
- fc2fan = 'fc2fan'
- fc2ppvdb = 'fc2ppvdb'
- gyutto = 'gyutto'
- jav321 = 'jav321'
- javbus = 'javbus'
- javdb = 'javdb'
- javlib = 'javlib'
- javmenu = 'javmenu'
- mgstage = 'mgstage'
- njav = 'njav'
- prestige = 'prestige'
- arzon = 'arzon'
- arzon_iv = 'arzon_iv'
+ airav = "airav"
+ avsox = "avsox"
+ avwiki = "avwiki"
+ dl_getchu = "dl_getchu"
+ fanza = "fanza"
+ fc2 = "fc2"
+ fc2fan = "fc2fan"
+ fc2ppvdb = "fc2ppvdb"
+ gyutto = "gyutto"
+ jav321 = "jav321"
+ javbus = "javbus"
+ javdb = "javdb"
+ javlib = "javlib"
+ javmenu = "javmenu"
+ mgstage = "mgstage"
+ njav = "njav"
+ prestige = "prestige"
+ arzon = "arzon"
+ arzon_iv = "arzon_iv"
+
class Network(BaseConfig):
proxy_server: Url | None
@@ -43,27 +46,28 @@ class Network(BaseConfig):
timeout: Duration
proxy_free: Dict[CrawlerID, Url]
+
class CrawlerSelect(BaseConfig):
def items(self) -> List[tuple[str, list[CrawlerID]]]:
return [
- ('normal', self.normal),
- ('fc2', self.fc2),
- ('cid', self.cid),
- ('getchu', self.getchu),
- ('gyutto', self.gyutto),
+ ("normal", self.normal),
+ ("fc2", self.fc2),
+ ("cid", self.cid),
+ ("getchu", self.getchu),
+ ("gyutto", self.gyutto),
]
def __getitem__(self, index) -> list[CrawlerID]:
match index:
- case 'normal':
+ case "normal":
return self.normal
- case 'fc2':
+ case "fc2":
return self.fc2
- case 'cid':
+ case "cid":
return self.cid
- case 'getchu':
+ case "getchu":
return self.getchu
- case 'gyutto':
+ case "gyutto":
return self.gyutto
raise Exception("Unknown crawler type")
@@ -73,37 +77,40 @@ def __getitem__(self, index) -> list[CrawlerID]:
getchu: list[CrawlerID]
gyutto: list[CrawlerID]
+
class MovieInfoField(str, Enum):
- dvdid = 'dvdid'
- cid = 'cid'
- url = 'url'
- plot = 'plot'
- cover = 'cover'
- big_cover = 'big_cover'
- genre = 'genre'
- genre_id = 'genre_id'
- genre_norm = 'genre_norm'
- score = 'score'
- title = 'title'
- ori_title = 'ori_title'
- magnet = 'magnet'
- serial = 'serial'
- actress = 'actress'
- actress_pics = 'actress_pics'
- director = 'director'
- duration = 'duration'
- producer = 'producer'
- publisher = 'publisher'
- uncensored = 'uncensored'
- publish_date = 'publish_date'
- preview_pics = 'preview_pics'
- preview_video = 'preview_video'
+ dvdid = "dvdid"
+ cid = "cid"
+ url = "url"
+ plot = "plot"
+ cover = "cover"
+ big_cover = "big_cover"
+ genre = "genre"
+ genre_id = "genre_id"
+ genre_norm = "genre_norm"
+ score = "score"
+ title = "title"
+ ori_title = "ori_title"
+ magnet = "magnet"
+ serial = "serial"
+ actress = "actress"
+ actress_pics = "actress_pics"
+ director = "director"
+ duration = "duration"
+ producer = "producer"
+ publisher = "publisher"
+ uncensored = "uncensored"
+ publish_date = "publish_date"
+ preview_pics = "preview_pics"
+ preview_video = "preview_video"
+
class UseJavDBCover(str, Enum):
yes = "yes"
no = "no"
fallback = "fallback"
+
class Crawler(BaseConfig):
selection: CrawlerSelect
required_keys: list[MovieInfoField]
@@ -114,6 +121,7 @@ class Crawler(BaseConfig):
use_javdb_cover: UseJavDBCover
normalize_actress_name: bool
+
class MovieDefault(BaseConfig):
title: str
actress: str
@@ -122,6 +130,7 @@ class MovieDefault(BaseConfig):
producer: str
publisher: str
+
class PathSummarize(BaseConfig):
output_folder_pattern: str
basename_pattern: str
@@ -130,25 +139,31 @@ class PathSummarize(BaseConfig):
max_actress_count: PositiveInt = 10
hard_link: bool
+
class TitleSummarize(BaseConfig):
remove_trailing_actor_name: bool
+
class NFOSummarize(BaseConfig):
basename_pattern: str
title_pattern: str
custom_genres_fields: list[str]
custom_tags_fields: list[str]
+
class ExtraFanartSummarize(BaseConfig):
enabled: bool
scrap_interval: Duration
+
class SlimefaceEngine(BaseConfig):
- name: Literal['slimeface']
+ name: Literal["slimeface"]
+
class CoverCrop(BaseConfig):
- engine: SlimefaceEngine | None
- on_id_pattern: list[str]
+ engine: SlimefaceEngine | None
+ on_id_pattern: list[str]
+
class CoverSummarize(BaseConfig):
basename_pattern: str
@@ -156,9 +171,11 @@ class CoverSummarize(BaseConfig):
add_label: bool
crop: CoverCrop
+
class FanartSummarize(BaseConfig):
basename_pattern: str
+
class Summarizer(BaseConfig):
default: MovieDefault
censor_options_representation: list[str]
@@ -170,60 +187,76 @@ class Summarizer(BaseConfig):
fanart: FanartSummarize
extra_fanarts: ExtraFanartSummarize
+
class BaiduTranslateEngine(BaseConfig):
- name: Literal['baidu']
+ name: Literal["baidu"]
app_id: str
api_key: str
+
class BingTranslateEngine(BaseConfig):
- name: Literal['bing']
+ name: Literal["bing"]
api_key: str
+
class ClaudeTranslateEngine(BaseConfig):
- name: Literal['claude']
+ name: Literal["claude"]
api_key: str
+
class OpenAITranslateEngine(BaseConfig):
- name: Literal['openai']
+ name: Literal["openai"]
url: Url
api_key: str
model: str
+
class GoogleTranslateEngine(BaseConfig):
- name: Literal['google']
+ name: Literal["google"]
+
TranslateEngine: TypeAlias = Union[
- BaiduTranslateEngine,
- BingTranslateEngine,
- ClaudeTranslateEngine,
- OpenAITranslateEngine,
- GoogleTranslateEngine,
- None]
+ BaiduTranslateEngine,
+ BingTranslateEngine,
+ ClaudeTranslateEngine,
+ OpenAITranslateEngine,
+ GoogleTranslateEngine,
+ None,
+]
+
class TranslateField(BaseConfig):
title: bool
plot: bool
+
class Translator(BaseConfig):
- engine: TranslateEngine = Field(..., discriminator='name')
+ engine: TranslateEngine = Field(..., discriminator="name")
fields: TranslateField
+
class Other(BaseConfig):
check_update: bool
auto_update: bool
+
def get_config_source():
- parser = ArgumentParser(prog='JavSP', description='汇总多站点数据的AV元数据刮削器', formatter_class=RawTextHelpFormatter)
- parser.add_argument('-c', '--config', help='使用指定的配置文件')
+ parser = ArgumentParser(
+ prog="JavSP",
+ description="汇总多站点数据的AV元数据刮削器",
+ formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument("-c", "--config", help="使用指定的配置文件")
args, _ = parser.parse_known_args()
sources = []
if args.config is None:
- args.config = resource_path('config.yml')
+ args.config = resource_path("config.yml")
sources.append(FileSource(file=args.config))
- sources.append(EnvSource(prefix='JAVSP_', allow_all=True))
- sources.append(CLArgSource(prefix='o'))
+ sources.append(EnvSource(prefix="JAVSP_", allow_all=True))
+ sources.append(CLArgSource(prefix="o"))
return sources
+
class Cfg(BaseConfig):
scanner: Scanner
network: Network
@@ -231,4 +264,4 @@ class Cfg(BaseConfig):
summarizer: Summarizer
translator: Translator
other: Other
- CONFIG_SOURCES=get_config_source()
+ CONFIG_SOURCES = get_config_source()
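
The Translator field above relies on pydantic's discriminated union: the literal `name` field decides which engine model validates the config. A minimal sketch of the mechanism with two of the engines, using a plain pydantic v2 BaseModel rather than the project's BaseConfig (an assumption for self-containment):

    from typing import Literal, Union
    from pydantic import BaseModel, Field

    class BaiduEngine(BaseModel):
        name: Literal["baidu"]
        app_id: str
        api_key: str

    class GoogleEngine(BaseModel):
        name: Literal["google"]

    class Translator(BaseModel):
        # `name` acts as the discriminator selecting the engine model
        engine: Union[BaiduEngine, GoogleEngine] = Field(discriminator="name")

    t = Translator.model_validate({"engine": {"name": "google"}})
    assert isinstance(t.engine, GoogleEngine)
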
diff --git a/javsp/cropper/__init__.py b/javsp/cropper/__init__.py
index e9c340873..381642289 100644
--- a/javsp/cropper/__init__.py
+++ b/javsp/cropper/__init__.py
@@ -2,8 +2,9 @@
from javsp.cropper.interface import Cropper, DefaultCropper
from javsp.cropper.slimeface_crop import SlimefaceCropper
+
def get_cropper(engine: SlimefaceEngine | None) -> Cropper:
if engine is None:
return DefaultCropper()
- if engine.name == 'slimeface':
+ if engine.name == "slimeface":
return SlimefaceCropper()
diff --git a/javsp/cropper/interface.py b/javsp/cropper/interface.py
index 710c2b630..698db72aa 100644
--- a/javsp/cropper/interface.py
+++ b/javsp/cropper/interface.py
@@ -1,23 +1,27 @@
from PIL.Image import Image
from abc import ABC, abstractmethod
+
+
class Cropper(ABC):
@abstractmethod
def crop_specific(self, fanart: Image, ratio: float) -> Image:
pass
def crop(self, fanart: Image, ratio: float | None = None) -> Image:
- if ratio is None:
+ if ratio is None:
ratio = 1.42
return self.crop_specific(fanart, ratio)
+
class DefaultCropper(Cropper):
def crop_specific(self, fanart: Image, ratio: float) -> Image:
"""将给定的fanart图片文件裁剪为适合poster尺寸的图片"""
(fanart_w, fanart_h) = fanart.size
- (poster_w, poster_h) = \
- (int(fanart_h / ratio), fanart_h) \
- if fanart_h / fanart_w < ratio \
- else (fanart_w, int(fanart_w * ratio)) # 图片太“瘦”时以宽度来定裁剪高度
+ (poster_w, poster_h) = (
+ (int(fanart_h / ratio), fanart_h)
+ if fanart_h / fanart_w < ratio
+ else (fanart_w, int(fanart_w * ratio))
+ ) # 图片太“瘦”时以宽度来定裁剪高度
box = (poster_w - fanart_w, 0, poster_w, poster_h)
fanart.crop(box)
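
The DefaultCropper arithmetic above: keep the full fanart height and take a slice of width height/ratio, unless the image is unusually tall, in which case keep the full width and take a slice of height width*ratio. A standalone sketch of that geometry; note it anchors the crop box to the image's right edge (where jacket front covers usually sit — an assumption about the intent) and returns the cropped image:

    from PIL import Image

    def default_poster_crop(fanart: Image.Image, ratio: float = 1.42) -> Image.Image:
        """Cut a poster with height/width aspect `ratio` out of a landscape fanart."""
        w, h = fanart.size
        if h / w < ratio:              # usual case: fanart wider than the poster
            pw, ph = int(h / ratio), h
        else:                          # very "thin" image: width limits the crop
            pw, ph = w, int(w * ratio)
        # keep the right edge of the image, where the front cover usually is
        return fanart.crop((w - pw, 0, w, ph))

    # e.g. an 800x538 fanart at ratio 1.42 gives a 378x538 poster,
    # cropped from box (422, 0, 800, 538)
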
diff --git a/javsp/cropper/slimeface_crop.py b/javsp/cropper/slimeface_crop.py
index a0f9712e1..610b4f8fb 100644
--- a/javsp/cropper/slimeface_crop.py
+++ b/javsp/cropper/slimeface_crop.py
@@ -2,33 +2,39 @@
from javsp.cropper.interface import Cropper, DefaultCropper
from javsp.cropper.utils import get_bound_box_by_face
+
class SlimefaceCropper(Cropper):
def crop_specific(self, fanart: Image.Image, ratio: float) -> Image.Image:
- try:
- # defer the libary import so we don't break if missing dependencies
+ try:
+ # defer the library import so we don't break when the dependency is missing
from slimeface import detectRGB
- bbox_confs = detectRGB(fanart.width, fanart.height, fanart.convert('RGB').tobytes())
- bbox_confs.sort(key=lambda conf_bbox: -conf_bbox[4]) # last arg stores confidence
+
+ bbox_confs = detectRGB(
+ fanart.width, fanart.height, fanart.convert("RGB").tobytes()
+ )
+ bbox_confs.sort(
+ key=lambda conf_bbox: -conf_bbox[4]
+ ) # last arg stores confidence
face = bbox_confs[0][:-1]
poster_box = get_bound_box_by_face(face, fanart.size, ratio)
return fanart.crop(poster_box)
except:
return DefaultCropper().crop_specific(fanart, ratio)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
from argparse import ArgumentParser
- arg_parser = ArgumentParser(prog='slimeface crop')
+ arg_parser = ArgumentParser(prog="slimeface crop")
- arg_parser.add_argument('-i', '--image', help='path to image to detect')
+ arg_parser.add_argument("-i", "--image", help="path to image to detect")
args, _ = arg_parser.parse_known_args()
- if(args.image is None):
+ if args.image is None:
print("USAGE: slimeface_crop.py -i/--image [path]")
exit(1)
input = Image.open(args.image)
im = SlimefaceCropper().crop(input)
- im.save('output.png')
-
+ im.save("output.png")
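
The sort key above implies detectRGB returns (x0, y0, x1, y1, confidence) tuples (an inference from the code, not documented here); the cropper keeps the highest-confidence face and falls back to DefaultCropper when the optional dependency is missing or detection fails. The selection step in isolation, with toy detection values:

    # toy detections in (x0, y0, x1, y1, confidence) form
    detections = [(10, 20, 60, 80, 0.51), (200, 40, 260, 110, 0.97)]
    detections.sort(key=lambda bbox: -bbox[4])   # highest confidence first
    face = detections[0][:-1]                    # (200, 40, 260, 110)
    # `face` is then handed to get_bound_box_by_face() to position the crop
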
diff --git a/javsp/cropper/utils.py b/javsp/cropper/utils.py
index b11b48eee..7f68ac57a 100644
--- a/javsp/cropper/utils.py
+++ b/javsp/cropper/utils.py
@@ -1,12 +1,16 @@
def get_poster_size(image_shape: tuple[int, int], ratio: float) -> tuple[int, int]:
- (fanart_w, fanart_h) = image_shape
- (poster_w, poster_h) = \
- (int(fanart_h / ratio), fanart_h) \
- if fanart_h / fanart_w < ratio \
- else (fanart_w, int(fanart_w * ratio)) # 图片太“瘦”时以宽度来定裁剪高度
- return (poster_w, poster_h)
+ (fanart_w, fanart_h) = image_shape
+ (poster_w, poster_h) = (
+ (int(fanart_h / ratio), fanart_h)
+ if fanart_h / fanart_w < ratio
+ else (fanart_w, int(fanart_w * ratio))
+ ) # 图片太“瘦”时以宽度来定裁剪高度
+ return (poster_w, poster_h)
+
-def get_bound_box_by_face(face: tuple[int, int, int, int], image_shape: tuple[int, int], ratio: float) -> tuple[int, int, int, int]:
+def get_bound_box_by_face(
+ face: tuple[int, int, int, int], image_shape: tuple[int, int], ratio: float
+) -> tuple[int, int, int, int]:
"""
returns (left, upper, right, lower)
"""
@@ -24,4 +28,3 @@ def get_bound_box_by_face(face: tuple[int, int, int, int], image_shape: tuple[in
poster_left = min(poster_left, fanart_w - poster_w)
poster_left = int(poster_left)
return (poster_left, 0, poster_left + poster_w, poster_h)
-
diff --git a/javsp/datatype.py b/javsp/datatype.py
index 4bfdd7171..67cc21666 100644
--- a/javsp/datatype.py
+++ b/javsp/datatype.py
@@ -1,4 +1,5 @@
"""定义数据类型和一些通用性的对数据类型的操作"""
+
import os
import csv
import json
@@ -11,7 +12,8 @@
logger = logging.getLogger(__name__)
-filemove_logger = logging.getLogger('filemove')
+filemove_logger = logging.getLogger("filemove")
+
class MovieInfo:
def __init__(self, dvdid: str = None, /, *, cid: str = None, from_file=None):
@@ -22,36 +24,38 @@ def __init__(self, dvdid: str = None, /, *, cid: str = None, from_file=None):
"""
arg_count = len([i for i in [dvdid, cid, from_file] if i])
if arg_count != 1:
- raise TypeError(f'Require 1 parameter but {arg_count} given')
+ raise TypeError(f"Require 1 parameter but {arg_count} given")
if isinstance(dvdid, Movie):
self.dvdid = dvdid.dvdid
self.cid = dvdid.cid
else:
- self.dvdid = dvdid # DVD ID,即通常的番号
- self.cid = cid # DMM Content ID
+ self.dvdid = dvdid # DVD ID,即通常的番号
+ self.cid = cid # DMM Content ID
# 创建类的默认属性
- self.url = None # 影片页面的URL
- self.plot = None # 故事情节
- self.cover = None # 封面图片(URL)
- self.big_cover = None # 高清封面图片(URL)
- self.genre = None # 影片分类的标签
- self.genre_id = None # 影片分类的标签的ID,用于解决部分站点多个genre同名的问题,也便于管理多语言的genre
- self.genre_norm = None # 统一后的影片分类的标签
- self.score = None # 评分(10分制,为方便提取写入和保持统一,应以字符串类型表示)
- self.title = None # 影片标题(不含番号)
- self.ori_title = None # 原始影片标题,仅在标题被处理过时才对此字段赋值
- self.magnet = None # 磁力链接
- self.serial = None # 系列
- self.actress = None # 出演女优
- self.actress_pics = None # 出演女优的头像。单列一个字段,便于满足不同的使用需要
- self.director = None # 导演
- self.duration = None # 影片时长
- self.producer = None # 制作商
- self.publisher = None # 发行商
- self.uncensored = None # 是否为无码影片
- self.publish_date = None # 发布日期
- self.preview_pics = None # 预览图片(URL)
- self.preview_video = None # 预览视频(URL)
+ self.url = None # 影片页面的URL
+ self.plot = None # 故事情节
+ self.cover = None # 封面图片(URL)
+ self.big_cover = None # 高清封面图片(URL)
+ self.genre = None # 影片分类的标签
+ self.genre_id = None # 影片分类的标签的ID,用于解决部分站点多个genre同名的问题,也便于管理多语言的genre
+ self.genre_norm = None # 统一后的影片分类的标签
+ self.score = (
+ None # 评分(10分制,为方便提取写入和保持统一,应以字符串类型表示)
+ )
+ self.title = None # 影片标题(不含番号)
+ self.ori_title = None # 原始影片标题,仅在标题被处理过时才对此字段赋值
+ self.magnet = None # 磁力链接
+ self.serial = None # 系列
+ self.actress = None # 出演女优
+ self.actress_pics = None # 出演女优的头像。单列一个字段,便于满足不同的使用需要
+ self.director = None # 导演
+ self.duration = None # 影片时长
+ self.producer = None # 制作商
+ self.publisher = None # 发行商
+ self.uncensored = None # 是否为无码影片
+ self.publish_date = None # 发布日期
+ self.preview_pics = None # 预览图片(URL)
+ self.preview_video = None # 预览视频(URL)
if from_file:
if os.path.isfile(from_file):
@@ -80,15 +84,15 @@ def dump(self, filepath=None, crawler=None) -> None:
if not filepath:
id = self.dvdid if self.dvdid else self.cid
if crawler:
- filepath = f'../unittest/data/{id} ({crawler}).json'
+ filepath = f"../unittest/data/{id} ({crawler}).json"
filepath = os.path.join(os.path.dirname(__file__), filepath)
else:
- filepath = id + '.json'
- with open(filepath, 'wt', encoding='utf-8') as f:
+ filepath = id + ".json"
+ with open(filepath, "wt", encoding="utf-8") as f:
f.write(str(self))
def load(self, filepath) -> None:
- with open(filepath, 'rt', encoding='utf-8') as f:
+ with open(filepath, "rt", encoding="utf-8") as f:
d = json.load(f)
# 更新对象属性
attrs = vars(self).keys()
@@ -100,68 +104,75 @@ def get_info_dic(self):
"""生成用来填充模板的字典"""
info = self
d = {}
- d['num'] = info.dvdid or info.cid
- d['title'] = info.title or Cfg().summarizer.default.title
- d['rawtitle'] = info.ori_title or d['title']
- d['actress'] = ','.join(info.actress) if info.actress else Cfg().summarizer.default.actress
- d['score'] = info.score or '0'
- d['censor'] = Cfg().summarizer.censor_options_representation[1 if info.uncensored else 0]
- d['serial'] = info.serial or Cfg().summarizer.default.series
- d['director'] = info.director or Cfg().summarizer.default.director
- d['producer'] = info.producer or Cfg().summarizer.default.producer
- d['publisher'] = info.publisher or Cfg().summarizer.default.publisher
- d['date'] = info.publish_date or '0000-00-00'
- d['year'] = d['date'].split('-')[0]
+ d["num"] = info.dvdid or info.cid
+ d["title"] = info.title or Cfg().summarizer.default.title
+ d["rawtitle"] = info.ori_title or d["title"]
+ d["actress"] = (
+ ",".join(info.actress) if info.actress else Cfg().summarizer.default.actress
+ )
+ d["score"] = info.score or "0"
+ d["censor"] = Cfg().summarizer.censor_options_representation[
+ 1 if info.uncensored else 0
+ ]
+ d["serial"] = info.serial or Cfg().summarizer.default.series
+ d["director"] = info.director or Cfg().summarizer.default.director
+ d["producer"] = info.producer or Cfg().summarizer.default.producer
+ d["publisher"] = info.publisher or Cfg().summarizer.default.publisher
+ d["date"] = info.publish_date or "0000-00-00"
+ d["year"] = d["date"].split("-")[0]
# cid中不会出现'-',可以直接从d['num']拆分出label
- num_items = d['num'].split('-')
- d['label'] = num_items[0] if len(num_items) > 1 else '---'
- d['genre'] = ','.join(info.genre_norm if info.genre_norm else info.genre if info.genre else [])
+ num_items = d["num"].split("-")
+ d["label"] = num_items[0] if len(num_items) > 1 else "---"
+ d["genre"] = ",".join(
+ info.genre_norm if info.genre_norm else info.genre if info.genre else []
+ )
return d
class Movie:
"""用于关联影片文件的类"""
+
def __init__(self, dvdid=None, /, *, cid=None) -> None:
arg_count = len([i for i in (dvdid, cid) if i])
if arg_count != 1:
- raise TypeError(f'Require 1 parameter but {arg_count} given')
+ raise TypeError(f"Require 1 parameter but {arg_count} given")
# 创建类的默认属性
- self.dvdid = dvdid # DVD ID,即通常的番号
- self.cid = cid # DMM Content ID
- self.files = [] # 关联到此番号的所有影片文件的列表(用于管理带有多个分片的影片)
- self.data_src = 'normal' # 数据源:不同的数据源将使用不同的爬虫
- self.info: MovieInfo = None # 抓取到的影片信息
- self.save_dir = None # 存放影片、封面、NFO的文件夹路径
- self.basename = None # 按照命名模板生成的不包含路径和扩展名的basename
- self.nfo_file = None # nfo文件的路径
- self.fanart_file = None # fanart文件的路径
- self.poster_file = None # poster文件的路径
- self.guid = None # GUI使用的唯一标识,通过dvdid和files做md5生成
+ self.dvdid = dvdid # DVD ID,即通常的番号
+ self.cid = cid # DMM Content ID
+ self.files = [] # 关联到此番号的所有影片文件的列表(用于管理带有多个分片的影片)
+ self.data_src = "normal" # 数据源:不同的数据源将使用不同的爬虫
+ self.info: MovieInfo = None # 抓取到的影片信息
+ self.save_dir = None # 存放影片、封面、NFO的文件夹路径
+ self.basename = None # 按照命名模板生成的不包含路径和扩展名的basename
+ self.nfo_file = None # nfo文件的路径
+ self.fanart_file = None # fanart文件的路径
+ self.poster_file = None # poster文件的路径
+ self.guid = None # GUI使用的唯一标识,通过dvdid和files做md5生成
@cached_property
def hard_sub(self) -> bool:
"""影片文件带有内嵌字幕"""
- return 'C' in self.attr_str
+ return "C" in self.attr_str
@cached_property
def uncensored(self) -> bool:
"""影片文件是无码流出/无码破解版本(很多种子并不严格区分这两种,故这里也不进一步细分)"""
- return 'U' in self.attr_str
+ return "U" in self.attr_str
@cached_property
def attr_str(self) -> str:
"""用来标示影片文件的额外属性的字符串(空字符串/-U/-C/-UC)"""
# 暂不支持多分片的影片
if len(self.files) != 1:
- return ''
+ return ""
r = detect_special_attr(self.files[0], self.dvdid)
if r:
- r = '-' + r
+ r = "-" + r
return r
def __repr__(self) -> str:
- if self.cid and self.data_src == 'cid':
+ if self.cid and self.data_src == "cid":
expression = f"('cid={self.cid}')"
else:
expression = f"('{self.dvdid}')"
@@ -169,13 +180,14 @@ def __repr__(self) -> str:
def rename_files(self, use_hardlink: bool = False) -> None:
"""根据命名规则移动(重命名)影片文件"""
- def move_file(src:str, dst:str):
+
+ def move_file(src: str, dst: str):
"""移动(重命名)文件并记录信息到日志"""
abs_dst = os.path.abspath(dst)
# shutil.move might overwrite dst file
if os.path.exists(abs_dst):
- raise FileExistsError(f'File exists: {abs_dst}')
- if (use_hardlink):
+ raise FileExistsError(f"File exists: {abs_dst}")
+ if use_hardlink:
os.link(src, abs_dst)
else:
shutil.move(src, abs_dst)
@@ -183,7 +195,9 @@ def move_file(src:str, dst:str):
dst_name = os.path.basename(dst)
logger.info(f"重命名文件: '{src_rel}' -> '...{os.sep}{dst_name}'")
# 目前StreamHandler并未设置filter,为了避免显示中出现重复的日志,这里暂时只能用debug级别
- filemove_logger.debug(f'移动(重命名)文件: \n 原路径: "{src}"\n 新路径: "{abs_dst}"')
+ filemove_logger.debug(
+ f'移动(重命名)文件: \n 原路径: "{src}"\n 新路径: "{abs_dst}"'
+ )
new_paths = []
dir = os.path.dirname(self.files[0])
@@ -196,28 +210,31 @@ def move_file(src:str, dst:str):
else:
for i, fullpath in enumerate(self.files, start=1):
ext = os.path.splitext(fullpath)[1]
- newpath = os.path.join(self.save_dir, self.basename + f'-CD{i}' + ext)
+ newpath = os.path.join(self.save_dir, self.basename + f"-CD{i}" + ext)
move_file(fullpath, newpath)
new_paths.append(newpath)
self.new_paths = new_paths
if len(os.listdir(dir)) == 0:
- #如果移动文件后目录为空则删除该目录
+ # 如果移动文件后目录为空则删除该目录
os.rmdir(dir)
class GenreMap(dict):
"""genre的映射表"""
+
def __init__(self, file):
genres = {}
- with open(resource_path(file), newline='', encoding='utf-8-sig') as csvfile:
+ with open(resource_path(file), newline="", encoding="utf-8-sig") as csvfile:
reader = csv.DictReader(csvfile)
try:
for row in reader:
- genres[row['id']] = row['translate']
+ genres[row["id"]] = row["translate"]
except UnicodeDecodeError:
- logger.error('CSV file must be saved as UTF-8-BOM to edit is in Excel')
+            logger.error("CSV file must be saved as UTF-8-BOM to edit it in Excel")
except KeyError:
- logger.error("The columns 'id' and 'translate' must exist in the csv file")
+ logger.error(
+ "The columns 'id' and 'translate' must exist in the csv file"
+ )
self.update(genres)
def map(self, ls):
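
For context, the mapping file that GenreMap consumes is a plain two-column CSV resolved through resource_path(); a minimal sketch of a compatible file and a lookup (file name and rows here are illustrative, not the project's shipped data):

    # genre_map.csv, saved as UTF-8 with BOM so it survives editing in Excel:
    #   id,translate
    #   1,素人
    #   2,中出
    from javsp.datatype import GenreMap

    gmap = GenreMap("data/genre_map.csv")
    print(gmap.get("1"))  # -> '素人'; unknown ids return None (plain dict semantics)
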
diff --git a/javsp/file.py b/javsp/file.py
index 9ae6b0f8b..b986dc08d 100644
--- a/javsp/file.py
+++ b/javsp/file.py
@@ -1,16 +1,22 @@
"""与文件相关的各类功能"""
+
import os
-from pathlib import Path
import re
import ctypes
import logging
import itertools
-import json
from sys import platform
from typing import List
-__all__ = ['scan_movies', 'get_fmt_size', 'get_remaining_path_len', 'replace_illegal_chars', 'get_failed_when_scan', 'find_subtitle_in_dir']
+__all__ = [
+ "scan_movies",
+ "get_fmt_size",
+ "get_remaining_path_len",
+ "replace_illegal_chars",
+ "get_failed_when_scan",
+ "find_subtitle_in_dir",
+]
from javsp.avid import *
@@ -24,14 +30,16 @@
def scan_movies(root: str) -> List[Movie]:
"""获取文件夹内的所有影片的列表(自动探测同一文件夹内的分片)"""
- # 由于实现的限制:
+ # 由于实现的限制:
# 1. 以数字编号最多支持10个分片,字母编号最多支持26个分片
# 2. 允许分片间的编号有公共的前导符(如编号01, 02, 03),因为求prefix时前导符也会算进去
# 扫描所有影片文件并获取它们的番号
- dic = {} # avid: [abspath1, abspath2...]
+ dic = {} # avid: [abspath1, abspath2...]
small_videos = {}
- ignore_folder_name_pattern = re.compile('|'.join(Cfg().scanner.ignored_folder_name_pattern))
+ ignore_folder_name_pattern = re.compile(
+ "|".join(Cfg().scanner.ignored_folder_name_pattern)
+ )
for dirpath, dirnames, filenames in os.walk(root):
for name in dirnames.copy():
if ignore_folder_name_pattern.match(name):
@@ -55,7 +63,7 @@ def scan_movies(root: str) -> List[Movie]:
else:
dic[avid] = [fullpath]
else:
- fail = Movie('无法识别番号')
+ fail = Movie("无法识别番号")
fail.files = [fullpath]
failed_items.append(fail)
logger.error(f"无法提取影片番号: '{fullpath}'")
@@ -70,15 +78,17 @@ def scan_movies(root: str) -> List[Movie]:
elif avid:
has_avid[name] = avid
# 对于前面忽略的视频生成一个简单的提示
- small_videos = {k:sorted(v) for k,v in sorted(small_videos.items())}
+ small_videos = {k: sorted(v) for k, v in sorted(small_videos.items())}
skipped_files = list(itertools.chain(*small_videos.values()))
skipped_cnt = len(skipped_files)
if skipped_cnt > 0:
if len(has_avid) > 0:
- logger.info(f"跳过了 {', '.join(has_avid)} 等{skipped_cnt}个小于指定大小的视频文件")
+ logger.info(
+ f"跳过了 {', '.join(has_avid)} 等{skipped_cnt}个小于指定大小的视频文件"
+ )
else:
logger.info(f"跳过了{skipped_cnt}个小于指定大小的视频文件")
- logger.debug('跳过的视频文件如下:\n' + '\n'.join(skipped_files))
+ logger.debug("跳过的视频文件如下:\n" + "\n".join(skipped_files))
# 检查是否有多部影片对应同一个番号
non_slice_dup = {} # avid: [abspath1, abspath2...]
for avid, files in dic.copy().items():
@@ -95,19 +105,21 @@ def scan_movies(root: str) -> List[Movie]:
basenames = [os.path.basename(i) for i in files]
prefix = os.path.commonprefix(basenames)
try:
- pattern_expr = re_escape(prefix) + r'\s*([a-z\d])\s*'
+ pattern_expr = re_escape(prefix) + r"\s*([a-z\d])\s*"
pattern = re.compile(pattern_expr, flags=re.I)
except re.error:
logger.debug(f"正则识别影片分片信息时出错: '{pattern_expr}'")
del dic[avid]
continue
- remaining = [pattern.sub(r'\1', i).lower() for i in basenames]
+ remaining = [pattern.sub(r"\1", i).lower() for i in basenames]
postfixes = [i[1:] for i in remaining]
slices = [i[0] for i in remaining]
# 如果有不同的后缀,说明有文件名不符合正则表达式条件(没有发生替换或不带分片信息)
- if (len(set(postfixes)) != 1
+ if (
+ len(set(postfixes)) != 1
# remaining为初步提取的分片信息,不允许有重复值
- or len(slices) != len(set(slices))):
+ or len(slices) != len(set(slices))
+ ):
logger.debug(f"无法识别分片信息: {prefix=}, {remaining=}")
non_slice_dup[avid] = files
del dic[avid]
@@ -115,7 +127,9 @@ def scan_movies(root: str) -> List[Movie]:
# 影片编号必须从 0/1/a 开始且编号连续
sorted_slices = sorted(slices)
first, last = sorted_slices[0], sorted_slices[-1]
- if (first not in ('0', '1', 'a')) or (ord(last) != (ord(first)+len(sorted_slices)-1)):
+ if (first not in ("0", "1", "a")) or (
+ ord(last) != (ord(first) + len(sorted_slices) - 1)
+ ):
logger.debug(f"无效的分片起始编号或分片编号不连续: {sorted_slices=}")
non_slice_dup[avid] = files
del dic[avid]
@@ -125,18 +139,21 @@ def scan_movies(root: str) -> List[Movie]:
dic[avid] = mapped_files
# 汇总输出错误提示信息
- msg = ''
+ msg = ""
for avid, files in non_slice_dup.items():
- msg += f'{avid}: \n'
+ msg += f"{avid}: \n"
for f in files:
- msg += (' ' + os.path.relpath(f, root) + '\n')
+ msg += " " + os.path.relpath(f, root) + "\n"
if msg:
- logger.error("下列番号对应多部影片文件且不符合分片规则,已略过整理,请手动处理后重新运行脚本: \n" + msg)
+ logger.error(
+ "下列番号对应多部影片文件且不符合分片规则,已略过整理,请手动处理后重新运行脚本: \n"
+ + msg
+ )
# 转换数据的组织格式
movies: List[Movie] = []
for avid, files in dic.items():
src = guess_av_type(avid)
- if src != 'cid':
+ if src != "cid":
mov = Movie(avid)
else:
mov = Movie(cid=avid)
@@ -144,7 +161,7 @@ def scan_movies(root: str) -> List[Movie]:
mov.dvdid = get_id(files[0])
mov.files = files
mov.data_src = src
- logger.debug(f'影片数据源类型: {avid}: {src}')
+ logger.debug(f"影片数据源类型: {avid}: {src}")
movies.append(mov)
return movies
@@ -154,37 +171,41 @@ def get_failed_when_scan():
return failed_items
-_PARDIR_REPLACE = re.compile(r'\.{2,}')
+_PARDIR_REPLACE = re.compile(r"\.{2,}")
+
+
def replace_illegal_chars(name):
"""将不能用于文件名的字符替换为形近的字符"""
# 非法字符列表 https://stackoverflow.com/a/31976060/6415337
- if platform == 'win32':
+ if platform == "win32":
# http://www.unicode.org/Public/security/latest/confusables.txt
- charmap = {'<': '❮',
- '>': '❯',
- ':': ':',
- '"': '″',
- '/': '/',
- '\\': '\',
- '|': '|',
- '?': '?',
- '*': '꘎'}
+ charmap = {
+ "<": "❮",
+ ">": "❯",
+ ":": ":",
+ '"': "″",
+ "/": "/",
+ "\\": "\",
+ "|": "|",
+ "?": "?",
+ "*": "꘎",
+ }
for c, rep in charmap.items():
name = name.replace(c, rep)
elif platform == "darwin": # MAC OS X
- name = name.replace(':', ':')
- else: # 其余都当做Linux处理
- name = name.replace('/', '/')
+ name = name.replace(":", ":")
+ else: # 其余都当做Linux处理
+ name = name.replace("/", "/")
# 处理连续多个英文句点.
if os.pardir in name:
- name = _PARDIR_REPLACE.sub('…', name)
+ name = _PARDIR_REPLACE.sub("…", name)
return name
def is_remote_drive(path: str):
"""判断一个路径是否为远程映射到本地"""
- #TODO: 当前仅支持Windows平台
- if platform != 'win32':
+ # TODO: 当前仅支持Windows平台
+ if platform != "win32":
return False
DRIVE_REMOTE = 0x4
drive = os.path.splitdrive(os.path.abspath(path))[0] + os.sep
@@ -194,10 +215,14 @@ def is_remote_drive(path: str):
def get_remaining_path_len(path):
"""计算当前系统支持的最大路径长度与给定路径长度的差值"""
- #TODO: 支持不同的操作系统
+ # TODO: 支持不同的操作系统
fullpath = os.path.abspath(path)
# Windows: If the length exceeds ~256 characters, you will be able to see the path/files via Windows/File Explorer, but may not be able to delete/move/rename these paths/files
- length = len(fullpath.encode('utf-8')) if Cfg().summarizer.path.length_by_byte else len(fullpath)
+ length = (
+ len(fullpath.encode("utf-8"))
+ if Cfg().summarizer.path.length_by_byte
+ else len(fullpath)
+ )
remaining = Cfg().summarizer.path.length_maximum - length
return remaining
@@ -215,7 +240,7 @@ def get_fmt_size(file_or_size) -> str:
size = file_or_size
else:
size = os.path.getsize(file_or_size)
- for unit in ['','Ki','Mi','Gi','Ti']:
+ for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
# 1023.995: to avoid rounding bug when format str, e.g. 1048571 -> 1024.0 KiB
if abs(size) < 1023.995:
return f"{size:3.2f} {unit}B"
@@ -223,7 +248,9 @@ def get_fmt_size(file_or_size) -> str:
_sub_files = {}
-SUB_EXTENSIONS = ('.srt', '.ass')
+SUB_EXTENSIONS = (".srt", ".ass")
+
+
def find_subtitle_in_dir(folder: str, dvdid: str):
"""在folder内寻找是否有匹配dvdid的字幕"""
folder_data = _sub_files.get(folder)
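
A quick sanity check for replace_illegal_chars, using the win32 charmap above (the fullwidth stand-ins come from Unicode's confusables table):

    from javsp.file import replace_illegal_chars

    print(replace_illegal_chars('ABC-123 "特典" 1/2?'))
    # win32 -> ABC-123 ″特典″ 1/2?  (each illegal character swapped for a look-alike)
    print(replace_illegal_chars("a..b"))
    # -> a…b  (runs of two or more dots collapse to '…' once os.pardir appears)
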
diff --git a/javsp/func.py b/javsp/func.py
index 042afea5c..ba2048acd 100644
--- a/javsp/func.py
+++ b/javsp/func.py
@@ -1,4 +1,5 @@
"""业务逻辑所需的或有一定通用性的函数"""
+
# 为了降低耦合度,也避免功能复杂后可能出现的循环导入的问题,这里尽量不导入项目内部的模块
# 如果需要获得配置信息,也应当由外部模块将配置项的值以参数的形式传入
import os
@@ -27,24 +28,31 @@
from javsp.lib import re_escape, resource_path
-__all__ = ['select_folder', 'get_scan_dir', 'remove_trail_actor_in_title',
- 'shutdown', 'CLEAR_LINE', 'check_update', 'split_by_punc']
+__all__ = [
+ "select_folder",
+ "get_scan_dir",
+ "remove_trail_actor_in_title",
+ "shutdown",
+ "CLEAR_LINE",
+ "check_update",
+ "split_by_punc",
+]
-CLEAR_LINE = '\r\x1b[K'
+CLEAR_LINE = "\r\x1b[K"
logger = logging.getLogger(__name__)
-def select_folder(default_dir=''):
+def select_folder(default_dir=""):
"""使用文件对话框提示用户选择一个文件夹"""
if not USE_GUI:
logger.error("无法打开窗口,请通过命令行的方式输入扫描路径")
exit(1)
window = Tk()
window.withdraw()
- window.iconbitmap(resource_path('image/JavSP.ico'))
+ window.iconbitmap(resource_path("image/JavSP.ico"))
path = filedialog.askdirectory(initialdir=default_dir)
- if path != '':
+ if path != "":
return os.path.normpath(path)
@@ -57,21 +65,21 @@ def get_scan_dir(cfg_scan_dir: Path | None) -> str | None:
else:
logger.error(f"配置的待整理文件夹无效:'{cfg_scan_dir}'")
else:
- if platform.system().lower() == 'windows':
- print('请选择要整理的文件夹:', end='')
+ if platform.system().lower() == "windows":
+ print("请选择要整理的文件夹:", end="")
root = select_folder()
else:
- root = input('请选择要整理的文件夹路径,必须是绝对路径: ')
+ root = input("请选择要整理的文件夹路径,必须是绝对路径: ")
print(root)
return root
-def remove_trail_actor_in_title(title:str, actors:list) -> str:
+def remove_trail_actor_in_title(title: str, actors: list) -> str:
"""寻找并移除标题尾部的女优名"""
if not (actors and title):
return title
# 目前使用分隔符白名单来做检测(担心按Unicode范围匹配误伤太多),考虑尽可能多的分隔符
- delimiters = '-xX &·,; &・,;'
+ delimiters = "-xX &·,; &・,;"
actor_ls = [re_escape(i) for i in actors if i]
pattern = f"^(.*?)([{delimiters}]{{1,3}}({'|'.join(actor_ls)}))+$"
# 使用match而不是sub是为了将替换掉的部分写入日志
@@ -87,11 +95,13 @@ def shutdown(timeout=120):
"""关闭计算机"""
try:
for i in reversed(range(timeout)):
- print(CLEAR_LINE + f"JavSP整理完成,将在 {i} 秒后关机。按'Ctrl+C'取消", end='')
+ print(
+ CLEAR_LINE + f"JavSP整理完成,将在 {i} 秒后关机。按'Ctrl+C'取消", end=""
+ )
time.sleep(1)
- logger.info('整理完成,自动关机')
- #TODO: 当前仅支持Windows平台
- os.system('shutdown -s')
+ logger.info("整理完成,自动关机")
+ # TODO: 当前仅支持Windows平台
+ os.system("shutdown -s")
except KeyboardInterrupt:
return
@@ -101,7 +111,7 @@ def utc2local(utc_str):
# python不支持 ISO-8601 中的Z后缀
now = time.time()
offset = datetime.fromtimestamp(now) - datetime.utcfromtimestamp(now)
- utc_str = utc_str.replace('Z', '+00:00')
+ utc_str = utc_str.replace("Z", "+00:00")
utc_time = datetime.fromisoformat(utc_str)
local_time = utc_time + offset
return local_time
@@ -111,7 +121,7 @@ def get_actual_width(mix_str: str) -> int:
"""给定一个中英混合的字符串,返回实际的显示宽度"""
width = len(mix_str)
for c in mix_str:
- if u'\u4e00' <= c <= u'\u9fa5':
+ if "\u4e00" <= c <= "\u9fa5":
width += 1
return width
@@ -120,31 +130,33 @@ def align_center(mix_str: str, total_width: int) -> str:
"""给定一个中英混合的字符串,根据其实际显示宽度中心对齐"""
actual_width = get_actual_width(mix_str)
add_space = int((total_width - actual_width) / 2)
- aligned_str = ' ' * add_space + mix_str
+ aligned_str = " " * add_space + mix_str
return aligned_str
# 枚举Unicode各平面内中日韩区块及拉丁字母区块内的所有标点符号
_punc = (
-" ", # spaces
-"!\"#%&'()*,-./:;?@[\\]_{}", # (0x0, 0x7f), Basic Latin
-"¡§«¶·»¿", # (0x80, 0xff), Latin-1 Supplement
-";·", # (0x370, 0x3ff), Greek and Coptic
-"‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞", # (0x2000, 0x206f), General Punctuation
-"、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽", # (0x3000, 0x303f), CJK Symbols and Punctuation
-"゠・", # (0x30a0, 0x30ff), Katakana
-"︐︑︒︓︔︕︖︗︘︙", # (0xfe10, 0xfe1f), Vertical Forms
-"︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏", # (0xfe30, 0xfe4f), CJK Compatibility Forms
-# "﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫", # (0xfe50, 0xfe6f), Small Form Variants
-"!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・", # (0xff00, 0xffef), Halfwidth and Fullwidth Forms
+ " ", # spaces
+ "!\"#%&'()*,-./:;?@[\\]_{}", # (0x0, 0x7f), Basic Latin
+ "¡§«¶·»¿", # (0x80, 0xff), Latin-1 Supplement
+ ";·", # (0x370, 0x3ff), Greek and Coptic
+ "‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞", # (0x2000, 0x206f), General Punctuation
+ "、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽", # (0x3000, 0x303f), CJK Symbols and Punctuation
+ "゠・", # (0x30a0, 0x30ff), Katakana
+ "︐︑︒︓︔︕︖︗︘︙", # (0xfe10, 0xfe1f), Vertical Forms
+ "︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏", # (0xfe30, 0xfe4f), CJK Compatibility Forms
+ # "﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫", # (0xfe50, 0xfe6f), Small Form Variants
+ "!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・", # (0xff00, 0xffef), Halfwidth and Fullwidth Forms
)
-_punc_pattern = re.compile('.*?[' + ''.join(_punc) + ']')
+_punc_pattern = re.compile(".*?[" + "".join(_punc) + "]")
+
+
def split_by_punc(s):
"""将一个字符串按照Unicode标准中的标点符号进行分割"""
iters = list(_punc_pattern.finditer(s))
if iters:
- ls = [s[i.span()[0]: i.span()[1]] for i in iters]
- ls.append(s[iters[-1].span()[1]:])
+ ls = [s[i.span()[0] : i.span()[1]] for i in iters]
+ ls.append(s[iters[-1].span()[1] :])
else:
ls = [s]
return ls
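
The behaviour of split_by_punc is easiest to see on a short sample (a sketch; note the tail after the last punctuation mark is appended even when it is empty):

    from javsp.func import split_by_punc

    print(split_by_punc("今日はいい天気、散歩しましょう。"))
    # -> ['今日はいい天気、', '散歩しましょう。', '']
    print(split_by_punc("NoPunctuationHere"))
    # -> ['NoPunctuationHere']
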
@@ -161,67 +173,71 @@ def print_header(title, info=[]):
info_width = 0
terminal_width = shutil.get_terminal_size().columns
display_width = min(max(title_width, info_width) + 6, terminal_width)
- print('=' * display_width)
+ print("=" * display_width)
for line in title:
print(align_center(line, display_width))
if info:
- print('-' * display_width)
+ print("-" * display_width)
for line in info:
print(line)
- print('=' * display_width)
- print('')
+ print("=" * display_width)
+ print("")
# 使用pyinstaller打包exe时生成hook,运行时由该hook将版本信息注入到sys中
- local_version = meta.version('javsp')
+ local_version = meta.version("javsp")
if local_version == "":
return
# 检查更新
if allow_check:
- api_url = 'https://api.github.com/repos/Yuukiy/JavSP/releases/latest'
- release_url = 'https://github.com/Yuukiy/JavSP/releases/latest'
- print('正在检查更新...', end='')
+ api_url = "https://api.github.com/repos/Yuukiy/JavSP/releases/latest"
+ release_url = "https://github.com/Yuukiy/JavSP/releases/latest"
+ print("正在检查更新...", end="")
try:
data = request_get(api_url, timeout=3).json()
- latest_version = data['tag_name']
- release_time = utc2local(data['published_at'])
- release_date = release_time.isoformat().split('T')[0]
+ latest_version = data["tag_name"]
+ release_time = utc2local(data["published_at"])
+ release_date = release_time.isoformat().split("T")[0]
if version.parse(local_version) < version.parse(latest_version):
- update_status = 'new_version'
+ update_status = "new_version"
else:
- update_status = 'already_latest'
+ update_status = "already_latest"
except Exception as e:
- logger.debug('检查版本更新时出错: ' + repr(e))
- update_status = 'fail_to_check'
+ logger.debug("检查版本更新时出错: " + repr(e))
+ update_status = "fail_to_check"
else:
- update_status = 'disallow'
+ update_status = "disallow"
# 根据检查更新的情况输出软件版本信息和更新信息
- print(CLEAR_LINE, end='')
- if update_status == 'disallow':
- title = f'Jav Scraper Package: {local_version}'
+ print(CLEAR_LINE, end="")
+ if update_status == "disallow":
+ title = f"Jav Scraper Package: {local_version}"
print_header([title])
- elif update_status == 'already_latest':
- title = f'Jav Scraper Package: {local_version} (已是最新版)'
+ elif update_status == "already_latest":
+ title = f"Jav Scraper Package: {local_version} (已是最新版)"
print_header([title])
- elif update_status == 'fail_to_check':
- release_url_mirror = 'https://hub.fastgit.xyz/Yuukiy/JavSP/releases/latest'
- titles = [f'Jav Scraper Package: {local_version}']
- info = ['检查更新失败,请前往以下地址查看最新版本:', ' '+release_url,
- '如果你打不开上面的地址,也可以尝试访问镜像站点:', ' '+release_url_mirror]
+ elif update_status == "fail_to_check":
+ release_url_mirror = "https://hub.fastgit.xyz/Yuukiy/JavSP/releases/latest"
+ titles = [f"Jav Scraper Package: {local_version}"]
+ info = [
+ "检查更新失败,请前往以下地址查看最新版本:",
+ " " + release_url,
+ "如果你打不开上面的地址,也可以尝试访问镜像站点:",
+ " " + release_url_mirror,
+ ]
print_header(titles, info)
- elif update_status == 'new_version':
- titles = [f'Jav Scraper Package: {local_version}']
- titles.append(f'↓ 有新版本可下载: {latest_version} ↓')
+ elif update_status == "new_version":
+ titles = [f"Jav Scraper Package: {local_version}"]
+ titles.append(f"↓ 有新版本可下载: {latest_version} ↓")
titles.append(release_url)
# 提取changelog消息
try:
enable_msg_head = True
- lines = data['body'].splitlines()
- changelog = [f'更新时间: {release_date}']
+ lines = data["body"].splitlines()
+ changelog = [f"更新时间: {release_date}"]
for line in lines:
- if line.startswith('## '):
+ if line.startswith("## "):
enable_msg_head = False
changelog.append(Style.BRIGHT + line[3:] + Style.RESET_ALL)
- elif line.startswith('- '):
+ elif line.startswith("- "):
enable_msg_head = False
changelog.append(line)
elif enable_msg_head:
@@ -232,15 +248,17 @@ def print_header(title, info=[]):
# 尝试自动更新
if auto_update:
try:
- logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)")
+ logger.info(
+ "尝试自动更新到新版本: " + latest_version + " (按'Ctrl+C'取消)"
+ )
download_update(data)
except KeyboardInterrupt:
- logger.info('用户取消更新')
+ logger.info("用户取消更新")
except Exception as e:
- logger.warning('自动更新失败,请重启程序再试或者手动下载更新')
+ logger.warning("自动更新失败,请重启程序再试或者手动下载更新")
logger.debug(e, exc_info=True)
finally:
- print() # 输出空行,作为新旧程序的分隔
+ print() # 输出空行,作为新旧程序的分隔
def download_update(rel_info):
@@ -249,22 +267,26 @@ def download_update(rel_info):
Args:
rel_info (json): 调用Github API得到的最新版的release信息
"""
- if rel_info.get('assets') and getattr(sys, 'frozen', False):
- down_url = rel_info['assets'][0]['browser_download_url']
- asset_name = rel_info['assets'][0]['name']
- desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name
+ if rel_info.get("assets") and getattr(sys, "frozen", False):
+ down_url = rel_info["assets"][0]["browser_download_url"]
+ asset_name = rel_info["assets"][0]["name"]
+ desc = (
+ "下载更新"
+ if shutil.get_terminal_size().columns < 120
+ else "下载更新: " + asset_name
+ )
download(down_url, asset_name, desc=desc)
if os.path.exists(asset_name):
# 备份原有的程序
basepath, ext = os.path.splitext(sys.executable)
- backup_name = basepath + '_backup' + ext
+ backup_name = basepath + "_backup" + ext
if os.path.exists(backup_name):
os.remove(backup_name)
os.rename(sys.executable, backup_name)
# 解压下载的zip文件
- with zipfile.ZipFile(asset_name, 'r') as zip_ref:
+ with zipfile.ZipFile(asset_name, "r") as zip_ref:
zip_ref.extractall()
- logger.info('更新完成,启动新版本程序...')
+ logger.info("更新完成,启动新版本程序...")
args = [sys.executable] + sys.argv[1:]
p = subprocess.Popen(args, start_new_session=True)
p.wait()
@@ -273,5 +295,5 @@ def download_update(rel_info):
if __name__ == "__main__":
- setattr(sys, 'javsp_version', 'v0')
+ setattr(sys, "javsp_version", "v0")
check_update()
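
The update decision above reduces to a PEP 440 comparison; a minimal sketch, assuming `version` is packaging.version as used in check_update (the tags are illustrative):

    from packaging import version

    local, latest = "v1.4.0", "v1.5.1"
    # packaging tolerates the leading 'v', so release tags compare directly
    if version.parse(local) < version.parse(latest):
        print("new_version")       # triggers the changelog printout above
    else:
        print("already_latest")
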
diff --git a/javsp/image.py b/javsp/image.py
index e98ece903..5ea209879 100644
--- a/javsp/image.py
+++ b/javsp/image.py
@@ -1,11 +1,11 @@
"""处理本地图片的相关功能"""
+
from enum import Enum
-import os
import logging
from PIL import Image, ImageOps
-__all__ = ['valid_pic', 'get_pic_size', 'add_label_to_poster', 'LabelPostion']
+__all__ = ["valid_pic", "get_pic_size", "add_label_to_poster", "LabelPostion"]
logger = logging.getLogger(__name__)
@@ -24,15 +24,19 @@ def valid_pic(pic_path):
# 位置枚举
class LabelPostion(Enum):
"""水印位置枚举"""
+
TOP_LEFT = 1
TOP_RIGHT = 2
BOTTOM_LEFT = 3
BOTTOM_RIGHT = 4
-def add_label_to_poster(poster: Image.Image, mark_pic_file: Image.Image, pos: LabelPostion) -> Image.Image:
+
+def add_label_to_poster(
+ poster: Image.Image, mark_pic_file: Image.Image, pos: LabelPostion
+) -> Image.Image:
"""向poster中添加标签(水印)"""
- mark_img = mark_pic_file.convert('RGBA')
- r,g,b,a = mark_img.split()
+ mark_img = mark_pic_file.convert("RGBA")
+ r, g, b, a = mark_img.split()
# 计算水印位置
if pos == LabelPostion.TOP_LEFT:
box = (0, 0)
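
A usage sketch for add_label_to_poster (the file names are hypothetical; the RGBA split above supplies the alpha mask used when pasting the label):

    from PIL import Image
    from javsp.image import LabelPostion, add_label_to_poster

    poster = Image.open("poster.jpg")   # hypothetical input files
    mark = Image.open("sub_mark.png")
    out = add_label_to_poster(poster, mark, LabelPostion.BOTTOM_RIGHT)
    out.save("poster_marked.jpg")
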
diff --git a/javsp/lib.py b/javsp/lib.py
index 3b6932d76..8335918fd 100644
--- a/javsp/lib.py
+++ b/javsp/lib.py
@@ -1,14 +1,17 @@
"""用来组织不需要依赖任何自定义类型的功能函数"""
+
import os
import re
import sys
from pathlib import Path
-__all__ = ['re_escape', 'resource_path', 'strftime_to_minutes', 'detect_special_attr']
+__all__ = ["re_escape", "resource_path", "strftime_to_minutes", "detect_special_attr"]
+
+
+_special_chars_map = {i: "\\" + chr(i) for i in b"()[]{}?*+|^$\\."}
-_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+|^$\\.'}
def re_escape(s: str) -> str:
"""用来对字符串进行转义,以将转义后的字符串用于构造正则表达式"""
pattern = s.translate(_special_chars_map)
@@ -33,42 +36,46 @@ def strftime_to_minutes(s: str) -> int:
Returns:
[int]: 取整后的分钟数
"""
- items = list(map(int, s.split(':')))
+ items = list(map(int, s.split(":")))
if len(items) == 2:
- minutes = items[0] + round(items[1]/60)
+ minutes = items[0] + round(items[1] / 60)
elif len(items) == 3:
- minutes = items[0] * 60 + items[1] + round(items[2]/60)
+ minutes = items[0] * 60 + items[1] + round(items[2] / 60)
else:
raise ValueError(f"无法将字符串'{s}'转换为分钟")
return minutes
-_PATTERN = re.compile(r'(uncen(sor(ed)?)?([- _\s]*leak(ed)?)?|[无無][码碼](流出|破解))', flags=re.I)
+_PATTERN = re.compile(
+ r"(uncen(sor(ed)?)?([- _\s]*leak(ed)?)?|[无無][码碼](流出|破解))", flags=re.I
+)
+
+
def detect_special_attr(filepath: str, avid: str = None) -> str:
"""通过文件名检测影片是否有特殊属性(内嵌字幕、无码流出/破解)
Returns:
[str]: '', 'U', 'C', 'UC'
"""
- result = ''
+ result = ""
base = os.path.splitext(os.path.basename(filepath))[0].upper()
# 尝试使用正则匹配
match = _PATTERN.search(base)
if match:
- result += 'U'
+ result += "U"
# 尝试匹配-C/-U/-UC后缀的影片
- postfix = base.split('-')[-1]
- if postfix in ('U', 'C', 'UC'):
+ postfix = base.split("-")[-1]
+ if postfix in ("U", "C", "UC"):
result += postfix
elif avid:
- pattern_str = re.sub(r'[_-]', '[_-]*', avid) + r'(UC|U|C)\b'
+ pattern_str = re.sub(r"[_-]", "[_-]*", avid) + r"(UC|U|C)\b"
match = re.search(pattern_str, base, flags=re.I)
if match:
result += match.group(1)
# 最终格式化
- result = ''.join(sorted(set(result), reverse=True))
+ result = "".join(sorted(set(result), reverse=True))
return result
if __name__ == "__main__":
- print(detect_special_attr('ipx-177cd1.mp4', 'IPX-177'))
+ print(detect_special_attr("ipx-177cd1.mp4", "IPX-177"))
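
strftime_to_minutes accepts both MM:SS and HH:MM:SS strings; its rounding behaviour in a quick sketch:

    from javsp.lib import strftime_to_minutes

    print(strftime_to_minutes("12:34"))    # 12 + round(34/60) -> 13
    print(strftime_to_minutes("1:30:05"))  # 1*60 + 30 + round(5/60) -> 90
    # anything else ("1:2:3:4", "abc") raises ValueError
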
diff --git a/javsp/nfo.py b/javsp/nfo.py
index 573aa0cc3..01a05f544 100644
--- a/javsp/nfo.py
+++ b/javsp/nfo.py
@@ -1,4 +1,5 @@
"""与操作nfo文件相关的功能"""
+
from lxml.etree import tostring
from lxml.builder import E
@@ -43,13 +44,13 @@ def write_nfo(info: MovieInfo, nfo_file):
# 但是Emby不支持此特性,Jellyfin的文档和社区都比较弱,没找到相关说明,推测多半也不支持
# fanart通常也是通过给fanart图片命名来匹配
- nfo.append(E.mpaa('NC-17')) # 分级
+ nfo.append(E.mpaa("NC-17")) # 分级
# 将DVD ID和CID写入到uniqueid字段
if info.dvdid:
- nfo.append(E.uniqueid(info.dvdid, type='num', default='true'))
+ nfo.append(E.uniqueid(info.dvdid, type="num", default="true"))
if info.cid:
- nfo.append(E.uniqueid(info.cid, type='cid'))
+ nfo.append(E.uniqueid(info.cid, type="cid"))
# 选择要写入的genre数据源字段:将[]作为后备结果,以确保genre结果为None时后续不会抛出异常
for genre_item in (info.genre_norm, info.genre, []):
@@ -70,7 +71,7 @@ def write_nfo(info: MovieInfo, nfo_file):
tags = []
# 添加自定义tag
for tag_new in Cfg().summarizer.nfo.custom_tags_fields:
- tags.append(tag_new.format(**dic))
+ tags.append(tag_new.format(**dic))
# 去重
tags = list(set(tags))
# 写入tag
@@ -78,7 +79,7 @@ def write_nfo(info: MovieInfo, nfo_file):
nfo.append(E.tag(i))
# Kodi上的country字段没说必须使用国家的代码(比如JP),所以目前暂定直接使用国家名
- nfo.append(E.country('日本'))
+ nfo.append(E.country("日本"))
if info.serial:
# 部分影片有系列。set字段支持overview作为介绍,但是目前没发现有地方可以获取到系列的介绍
@@ -109,13 +110,20 @@ def write_nfo(info: MovieInfo, nfo_file):
else:
nfo.append(E.actor(E.name(i)))
- with open(nfo_file, 'wt', encoding='utf-8') as f:
- f.write(tostring(nfo, encoding='unicode', pretty_print=True,
- doctype=''))
+ with open(nfo_file, "wt", encoding="utf-8") as f:
+ f.write(
+ tostring(
+ nfo,
+ encoding="unicode",
+ pretty_print=True,
+ doctype='',
+ )
+ )
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
- info = MovieInfo(from_file=R'unittest\data\IPX-177 (javbus).json')
+ info = MovieInfo(from_file=R"unittest\data\IPX-177 (javbus).json")
write_nfo(info)
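
For reference, the fragment produced by write_nfo looks roughly like this (an illustrative excerpt assuming the usual Kodi-style <movie> root; elements appear only when the corresponding field is present):

    <movie>
      <mpaa>NC-17</mpaa>
      <uniqueid type="num" default="true">IPX-177</uniqueid>
      <country>日本</country>
      <actor>
        <name>...</name>
      </actor>
    </movie>
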
diff --git a/javsp/print.py b/javsp/print.py
index 651b75679..d96e5b82c 100644
--- a/javsp/print.py
+++ b/javsp/print.py
@@ -1,9 +1,10 @@
"""改写内置的print函数,将其输出重定向到tqdm"""
+
import tqdm
import inspect
-__all__ = ['TqdmOut']
+__all__ = ["TqdmOut"]
# 普通输出和tqdm的输出混在一起会导致显示错乱,故在使用tqdm时要使用tqdm.write方法。
@@ -13,17 +14,22 @@
# 在单个模块内,不执行导入,这样的话在各个模块内仍然可以直接使用print
builtin_print = print
+
+
def flex_print(*args, **kwargs):
try:
tqdm.tqdm.write(*args, **kwargs)
except:
- builtin_print(*args, ** kwargs)
+ builtin_print(*args, **kwargs)
+
+
# 替换内置的print
inspect.builtins.print = flex_print
class TqdmOut:
"""用于将logging的stream输出重定向到tqdm"""
+
@classmethod
def write(cls, s, file=None, nolock=False):
- tqdm.tqdm.write(s, file=file, end='', nolock=nolock)
+ tqdm.tqdm.write(s, file=file, end="", nolock=nolock)
diff --git a/javsp/web/airav.py b/javsp/web/airav.py
index 22e9fdbf7..9626f2653 100644
--- a/javsp/web/airav.py
+++ b/javsp/web/airav.py
@@ -1,4 +1,5 @@
"""从airav抓取数据"""
+
import re
import logging
from html import unescape
@@ -11,13 +12,13 @@
# 初始化Request实例
request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
+request.headers["Accept-Language"] = "zh-TW,zh;q=0.9"
# 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒
request.timeout = 20
logger = logging.getLogger(__name__)
-base_url = 'https://www.airav.wiki'
+base_url = "https://www.airav.wiki"
def search_movie(dvdid):
@@ -27,74 +28,78 @@ def search_movie(dvdid):
count = 1
result = []
while len(result) < count:
- url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
+ url = (
+ f"{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}"
+ )
r = request.get(url).json()
# {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
- if r['result']:
- result.extend(r['result'])
- count = r['count']
+ if r["result"]:
+ result.extend(r["result"])
+ count = r["count"]
page += 1
- else: # 结果为空,结束循环
+ else: # 结果为空,结束循环
break
# 如果什么都没搜索到,直接返回
if not result:
raise MovieNotFoundError(__name__, dvdid)
# 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472')
- result.sort(key=lambda x:x['barcode'])
+ result.sort(key=lambda x: x["barcode"])
# 从所有搜索结果中选择最可能的番号,返回它的URL
- target = dvdid.replace('-', '_')
+ target = dvdid.replace("-", "_")
for item in result:
# {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
- barcode = item['barcode'].replace('-', '_')
+ barcode = item["barcode"].replace("-", "_")
if target in barcode:
- return item['barcode']
+ return item["barcode"]
raise MovieNotFoundError(__name__, dvdid, result)
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据
- url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
+ url = f"{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW"
resp = request.get(url).json()
# 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息
- if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
+ if resp["count"] == 0 and re.match(r"\d{6}[-_]\d{2,3}", movie.dvdid):
barcode = search_movie(movie.dvdid)
if barcode:
- url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
+ url = f"{base_url}/api/video/barcode/{barcode}?lng=zh-TW"
resp = request.get(url).json()
- if resp['count'] == 0:
+ if resp["count"] == 0:
raise MovieNotFoundError(__name__, movie.dvdid, resp)
# 从API返回的数据中提取需要的字段
# TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展
- data = resp['result']
- dvdid = data['barcode']
+ data = resp["result"]
+ dvdid = data["barcode"]
movie.dvdid = dvdid
- movie.url = base_url + '/video/' + dvdid
+ movie.url = base_url + "/video/" + dvdid
# plot和title中可能含有HTML的转义字符,需要进行解转义处理
- movie.plot = unescape(data['description']) or None
- movie.cover = data['img_url']
+ movie.plot = unescape(data["description"]) or None
+ movie.cover = data["img_url"]
# airav的genre是以搜索关键词的形式组织的,没有特定的genre_id
- movie.genre = [i['name'] for i in data['tags']]
- movie.title = unescape(data['name'])
- movie.actress = [i['name'] for i in data['actors']]
- movie.publish_date = data['publish_date']
- movie.preview_pics = data['images'] or []
- if data['factories']:
- movie.producer = data['factories'][0]['name']
+ movie.genre = [i["name"] for i in data["tags"]]
+ movie.title = unescape(data["name"])
+ movie.actress = [i["name"] for i in data["actors"]]
+ movie.publish_date = data["publish_date"]
+ movie.preview_pics = data["images"] or []
+ if data["factories"]:
+ movie.producer = data["factories"][0]["name"]
if Cfg().crawler.hardworking:
# 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
- video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
+ video_url = (
+ f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
+ )
resp = request.get(video_url).json()
# 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
- if 'data' in resp:
+ if "data" in resp:
# 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
# TODO: 发现部分影片(如080719-976)的传统格式预览片错误
- movie.preview_video = resp['data'].get('url')
+ movie.preview_video = resp["data"].get("url")
# airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确
- for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
+ for keyword in ("馬賽克破壞版", "馬賽克破解版", "無碼流出版"):
if movie.title and keyword in movie.title:
movie.title = None
movie.genre = []
@@ -107,10 +112,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('DSAD-938')
+ movie = MovieInfo("DSAD-938")
try:
parse_data(movie)
print(movie)
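
The pagination in search_movie, as a standalone sketch (this bypasses the module's Request wrapper, so no cloudscraper, custom headers or proxy handling):

    import requests

    base_url = "https://www.airav.wiki"
    dvdid, page, count, result = "012717_472", 1, 1, []
    while len(result) < count:
        url = f"{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}"
        r = requests.get(url, timeout=20).json()
        # response shape: {"offset": ..., "count": ..., "result": [...], "status": "ok"}
        if not r["result"]:
            break                    # empty page: nothing more to fetch
        result.extend(r["result"])
        count = r["count"]           # total hit count reported by the API
        page += 1
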
diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py
index 433949018..156b5e045 100644
--- a/javsp/web/arzon.py
+++ b/javsp/web/arzon.py
@@ -1,6 +1,5 @@
"""从arzon抓取数据"""
-import os
-import sys
+
import logging
import re
@@ -13,22 +12,26 @@
logger = logging.getLogger(__name__)
base_url = "https://www.arzon.jp"
+
def get_cookie():
# https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
+ skip_verify_url = (
+ "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
+ )
session = requests.Session()
session.get(skip_verify_url, timeout=(12, 7))
return session.cookies.get_dict()
+
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
full_id = movie.dvdid
cookies = get_cookie()
- url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}'
+ url = f"{base_url}/itemlist.html?t=&m=all&s=&q={full_id}"
# url = f'{base_url}/imagelist.html?q={full_id}'
r = request_get(url, cookies, delay_raise=True)
if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
+ raise MovieNotFoundError(__name__, movie.dvdid)
# https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
data = html.fromstring(r.content)
@@ -38,61 +41,65 @@ def parse_data(movie: MovieInfo):
item_url = base_url + urls[0]
e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
+ item = html.fromstring(e.content)
title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
cover = item.xpath("//td[@align='center']//a/img/@src")[0]
item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
+ plot = [item.strip() for item in item_text if item.strip() != ""][0]
preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src")
# 使用列表推导式添加 "http:" 并去除 "m_"
preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr]
container = item.xpath("//div[@class='item_register']/table//tr")
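+    video_type = genre = None  # 预先赋默认值,避免页面缺少对应表格行时变量未定义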
for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "AV女優:":
- movie.actress = content
- if key == "AVメーカー:":
- movie.producer = value
- if key == "AVレーベル:":
- video_type = value
- if key == "シリーズ:":
- movie.serial = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != ""]
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "AV女優:":
+ movie.actress = content
+ if key == "AVメーカー:":
+ movie.producer = value
+ if key == "AVレーベル:":
+ video_type = value
+ if key == "シリーズ:":
+ movie.serial = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = (
+ re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ )
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r"([\d.]+)分", value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
if video_type:
-            genres = [video_type]
-        if(genre != None):
-            genres.append(genre)
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
movie.genre = genres
movie.url = item_url
movie.title = title
movie.plot = plot
- movie.cover = f'https:{cover}'
+ movie.cover = f"https:{cover}"
movie.preview_pics = preview_pics
+
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('csct-011')
+ movie = MovieInfo("csct-011")
try:
parse_data(movie)
print(movie)
diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py
index 3ea7a322f..37748c091 100644
--- a/javsp/web/arzon_iv.py
+++ b/javsp/web/arzon_iv.py
@@ -1,6 +1,5 @@
"""从arzon抓取数据"""
-import os
-import sys
+
import logging
import re
@@ -13,21 +12,25 @@
logger = logging.getLogger(__name__)
base_url = "https://www.arzon.jp"
+
def get_cookie():
# https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
+ skip_verify_url = (
+ "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
+ )
session = requests.Session()
session.get(skip_verify_url, timeout=(12, 7))
return session.cookies.get_dict()
+
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
full_id = movie.dvdid
cookies = get_cookie()
- url = f'{base_url}/imagelist.html?q={full_id}'
+ url = f"{base_url}/imagelist.html?q={full_id}"
r = request_get(url, cookies, delay_raise=True)
if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
+ raise MovieNotFoundError(__name__, movie.dvdid)
# https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
data = html.fromstring(r.content)
@@ -37,55 +40,59 @@ def parse_data(movie: MovieInfo):
item_url = base_url + urls[0]
e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
+ item = html.fromstring(e.content)
title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
cover = item.xpath("//td[@align='center']//a/img/@src")[0]
item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
+ plot = [item.strip() for item in item_text if item.strip() != ""][0]
container = item.xpath("//div[@class='item_register']/table//tr")
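+    video_type = genre = None  # 预先赋默认值,避免页面缺少对应表格行时变量未定义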
for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "タレント:":
- movie.actress = content
- if key == "イメージメーカー:":
- movie.producer = value
- if key == "イメージレーベル:":
- video_type = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != ""]
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "タレント:":
+ movie.actress = content
+ if key == "イメージメーカー:":
+ movie.producer = value
+ if key == "イメージレーベル:":
+ video_type = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = (
+ re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ )
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r"([\d.]+)分", value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
if video_type:
-            genres = [video_type]
-        if(genre != None):
-            genres.append(genre)
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
movie.genre = genres
movie.url = item_url
movie.title = title
movie.plot = plot
- movie.cover = f'https:{cover}'
+ movie.cover = f"https:{cover}"
+
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('KIDM-1137B')
+ movie = MovieInfo("KIDM-1137B")
try:
parse_data(movie)
print(movie)
diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py
index ea96d6cc3..3d0bbae80 100644
--- a/javsp/web/avsox.py
+++ b/javsp/web/avsox.py
@@ -1,4 +1,5 @@
"""从avsox抓取数据"""
+
import logging
from javsp.web.base import get_html
@@ -15,15 +16,15 @@ def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页
full_id = movie.dvdid
- if full_id.startswith('FC2-'):
- full_id = full_id.replace('FC2-', 'FC2-PPV-')
- html = get_html(f'{base_url}tw/search/{full_id}')
+ if full_id.startswith("FC2-"):
+ full_id = full_id.replace("FC2-", "FC2-PPV-")
+ html = get_html(f"{base_url}tw/search/{full_id}")
ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()")
urls = html.xpath("//a[contains(@class, 'movie-box')]/@href")
ids_lower = list(map(str.lower, ids))
if full_id.lower() in ids_lower:
url = urls[ids_lower.index(full_id.lower())]
- url = url.replace('/tw/', '/cn/', 1)
+ url = url.replace("/tw/", "/cn/", 1)
else:
raise MovieNotFoundError(__name__, movie.dvdid, ids)
@@ -35,7 +36,7 @@ def parse_data(movie: MovieInfo):
info = container.xpath("div/div[@class='col-md-3 info']")[0]
dvdid = info.xpath("p/span[@style]/text()")[0]
publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
+ duration = info.xpath("p/span[text()='长度:']")[0].tail.replace("分钟", "").strip()
producer, serial = None, None
producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
if producer_tag:
@@ -46,15 +47,15 @@ def parse_data(movie: MovieInfo):
genre = info.xpath("p/span[@class='genre']/a/text()")
actress = container.xpath("//a[@class='avatar-box']/span/text()")
- movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-')
+ movie.dvdid = dvdid.replace("FC2-PPV-", "FC2-")
movie.url = url
- movie.title = title.replace(dvdid, '').strip()
+ movie.title = title.replace(dvdid, "").strip()
movie.cover = cover
movie.publish_date = publish_date
movie.duration = duration
movie.genre = genre
movie.actress = actress
- if full_id.startswith('FC2-'):
+ if full_id.startswith("FC2-"):
# avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整
movie.producer = serial
else:
@@ -64,10 +65,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('082713-417')
+ movie = MovieInfo("082713-417")
try:
parse_data(movie)
print(movie)
diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py
index fbd4ecbb3..216621a70 100644
--- a/javsp/web/avwiki.py
+++ b/javsp/web/avwiki.py
@@ -1,4 +1,5 @@
"""从av-wiki抓取数据"""
+
import logging
@@ -7,7 +8,7 @@
from javsp.datatype import MovieInfo
logger = logging.getLogger(__name__)
-base_url = 'https://av-wiki.net'
+base_url = "https://av-wiki.net"
def parse_data(movie: MovieInfo):
@@ -15,7 +16,7 @@ def parse_data(movie: MovieInfo):
Args:
movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
"""
- movie.url = url = f'{base_url}/{movie.dvdid}'
+ movie.url = url = f"{base_url}/{movie.dvdid}"
resp = request_get(url, delay_raise=True)
if resp.status_code == 404:
raise MovieNotFoundError(__name__, movie.dvdid)
@@ -24,47 +25,56 @@ def parse_data(movie: MovieInfo):
cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img")
if cover_tag:
try:
- srcset = cover_tag[0].get('srcset').split(', ')
+ srcset = cover_tag[0].get("srcset").split(", ")
src_set_urls = {}
for src in srcset:
url, width = src.split()
- width = int(width.rstrip('w'))
+ width = int(width.rstrip("w"))
src_set_urls[width] = url
- max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True)
+ max_pic = sorted(src_set_urls.items(), key=lambda x: x[0], reverse=True)
movie.cover = max_pic[0][1]
except:
- movie.cover = cover_tag[0].get('src')
+ movie.cover = cover_tag[0].get("src")
body = html.xpath("//section[@class='article-body']")[0]
title = body.xpath("div/p/text()")[0]
- title = title.replace(f"【{movie.dvdid}】", '')
+ title = title.replace(f"【{movie.dvdid}】", "")
cite_url = body.xpath("div/cite/a/@href")[0]
- cite_url = cite_url.split('?aff=')[0]
+ cite_url = cite_url.split("?aff=")[0]
info = body.xpath("dl[@class='dltable']")[0]
dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd")
data = {}
for dt_txt, dd in zip(dt_txt_ls, dd_tags):
dt_txt = dt_txt.strip()
- a_tag = dd.xpath('a')
+ a_tag = dd.xpath("a")
if len(a_tag) == 0:
dd_txt = dd.text.strip()
else:
dd_txt = [i.text.strip() for i in a_tag]
- if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留
+ if (
+ isinstance(dd_txt, list) and dt_txt != "AV女優名"
+ ): # 只有女优名以列表的数据格式保留
dd_txt = dd_txt[0]
data[dt_txt] = dd_txt
- ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'}
+ ATTR_MAP = {
+ "メーカー": "producer",
+ "AV女優名": "actress",
+ "メーカー品番": "dvdid",
+ "シリーズ": "serial",
+ "配信開始日": "publish_date",
+ }
for key, attr in ATTR_MAP.items():
setattr(movie, attr, data.get(key))
movie.title = title
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
- movie = MovieInfo('259LUXU-593')
+ movie = MovieInfo("259LUXU-593")
try:
parse_data(movie)
print(movie)
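
The srcset handling above simply picks the widest candidate; the same logic as a standalone sketch (URL values illustrative):

    srcset = "https://img.example/s.jpg 300w, https://img.example/l.jpg 1200w"
    src_set_urls = {}
    for src in srcset.split(", "):
        url, width = src.split()
        src_set_urls[int(width.rstrip("w"))] = url
    max_pic = sorted(src_set_urls.items(), key=lambda x: x[0], reverse=True)
    print(max_pic[0][1])  # -> https://img.example/l.jpg (the widest image wins)
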
diff --git a/javsp/web/base.py b/javsp/web/base.py
index 717b5168a..fee9da7c3 100644
--- a/javsp/web/base.py
+++ b/javsp/web/base.py
@@ -1,4 +1,5 @@
"""网络请求的统一接口"""
+
import os
import sys
import time
@@ -18,27 +19,42 @@
from javsp.web.exceptions import *
-__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy']
+__all__ = [
+ "Request",
+ "get_html",
+ "post_html",
+ "request_get",
+ "resp2html",
+ "is_connectable",
+ "download",
+ "get_resp_text",
+ "read_proxy",
+]
-headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
+headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+}
logger = logging.getLogger(__name__)
# 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试
-cleaner = Cleaner(kill_tags=['script', 'noscript'])
+cleaner = Cleaner(kill_tags=["script", "noscript"])
+
def read_proxy():
if Cfg().network.proxy_server is None:
return {}
else:
proxy = str(Cfg().network.proxy_server)
- return {'http': proxy, 'https': proxy}
+ return {"http": proxy, "https": proxy}
+
# 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站
# 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个
# 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行进行定制
-class Request():
+class Request:
"""作为网络请求出口并支持各个模块定制功能"""
+
def __init__(self, use_scraper=False) -> None:
# 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效
self.headers = headers.copy()
@@ -59,44 +75,54 @@ def __init__(self, use_scraper=False) -> None:
def _scraper_monitor(self, func):
"""监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求"""
+
def wrapper(*args, **kw):
try:
return func(*args, **kw)
except Exception as e:
- logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求")
+ logger.debug(
+ f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求"
+ )
if func == self.scraper.get:
return requests.get(*args, **kw)
else:
return requests.post(*args, **kw)
+
return wrapper
def get(self, url, delay_raise=False):
- r = self.__get(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
+ r = self.__get(
+ url,
+ headers=self.headers,
+ proxies=self.proxies,
+ cookies=self.cookies,
+ timeout=self.timeout,
+ )
if not delay_raise:
r.raise_for_status()
return r
def post(self, url, data, delay_raise=False):
- r = self.__post(url,
- data=data,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
+ r = self.__post(
+ url,
+ data=data,
+ headers=self.headers,
+ proxies=self.proxies,
+ cookies=self.cookies,
+ timeout=self.timeout,
+ )
if not delay_raise:
r.raise_for_status()
return r
def head(self, url, delay_raise=True):
- r = self.__head(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
+ r = self.__head(
+ url,
+ headers=self.headers,
+ proxies=self.proxies,
+ cookies=self.cookies,
+ timeout=self.timeout,
+ )
if not delay_raise:
r.raise_for_status()
return r
@@ -118,10 +144,12 @@ def request_get(url, cookies={}, timeout=None, delay_raise=False):
"""获取指定url的原始请求"""
if timeout is None:
timeout = Cfg().network.timeout.seconds
-
- r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
+
+ r = requests.get(
+ url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout
+ )
if not delay_raise:
- if r.status_code == 403 and b'>Just a moment...<' in r.content:
+ if r.status_code == 403 and b">Just a moment...<" in r.content:
raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}")
else:
r.raise_for_status()
@@ -132,7 +160,14 @@ def request_post(url, data, cookies={}, timeout=None, delay_raise=False):
"""向指定url发送post请求"""
if timeout is None:
timeout = Cfg().network.timeout.seconds
- r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
+ r = requests.post(
+ url,
+ data=data,
+ headers=headers,
+ proxies=read_proxy(),
+ cookies=cookies,
+ timeout=timeout,
+ )
if not delay_raise:
r.raise_for_status()
return r
@@ -147,7 +182,7 @@ def get_resp_text(resp: Response, encoding=None):
return resp.text
-def get_html(url, encoding='utf-8'):
+def get_html(url, encoding="utf-8"):
"""使用get方法访问指定网页并返回经lxml解析后的document"""
resp = request_get(url)
text = get_resp_text(resp, encoding=encoding)
@@ -155,23 +190,23 @@ def get_html(url, encoding='utf-8'):
html.make_links_absolute(url, resolve_base_href=True)
# 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus)
# html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
+ if hasattr(sys, "javsp_debug_mode"):
lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
return html
-def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment:
+def resp2html(resp, encoding="utf-8") -> lxml.html.HtmlComment:
"""将request返回的response转换为经lxml解析后的document"""
text = get_resp_text(resp, encoding=encoding)
html = lxml.html.fromstring(text)
html.make_links_absolute(resp.url, resolve_base_href=True)
# html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
+ if hasattr(sys, "javsp_debug_mode"):
lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
return html
-def post_html(url, data, encoding='utf-8', cookies={}):
+def post_html(url, data, encoding="utf-8", cookies={}):
"""使用post方法访问指定网页并返回经lxml解析后的document"""
resp = request_post(url, data, cookies=cookies)
text = get_resp_text(resp, encoding=encoding)
@@ -179,11 +214,11 @@ def post_html(url, data, encoding='utf-8', cookies={}):
# jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理
ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]")
for tag in ed2k_tags:
- tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], ''
+ tag.attrib["ed2k"], tag.attrib["href"] = tag.attrib["href"], ""
html.make_links_absolute(url, resolve_base_href=True)
for tag in ed2k_tags:
- tag.attrib['href'] = tag.attrib['ed2k']
- tag.attrib.pop('ed2k')
+ tag.attrib["href"] = tag.attrib["ed2k"]
+ tag.attrib.pop("ed2k")
# html = cleaner.clean_html(html)
# lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
return html
@@ -192,9 +227,9 @@ def post_html(url, data, encoding='utf-8', cookies={}):
def dump_xpath_node(node, filename=None):
"""将xpath节点dump到文件"""
if not filename:
- filename = node.tag + '.html'
- with open(filename, 'wt', encoding='utf-8') as f:
- content = etree.tostring(node, pretty_print=True).decode('utf-8')
+ filename = node.tag + ".html"
+ with open(filename, "wt", encoding="utf-8") as f:
+ content = etree.tostring(node, pretty_print=True).decode("utf-8")
f.write(content)
@@ -213,16 +248,17 @@ def urlretrieve(url, filename=None, reporthook=None, headers=None):
headers["Referer"] = "https://www.arzon.jp/"
"""使用requests实现urlretrieve"""
# https://blog.csdn.net/qq_38282706/article/details/80253447
- with contextlib.closing(requests.get(url, headers=headers,
- proxies=read_proxy(), stream=True)) as r:
+ with contextlib.closing(
+ requests.get(url, headers=headers, proxies=read_proxy(), stream=True)
+ ) as r:
header = r.headers
- with open(filename, 'wb+') as fp:
+ with open(filename, "wb+") as fp:
bs = 1024
size = -1
blocknum = 0
if "content-length" in header:
- size = int(header["Content-Length"]) # 文件总大小(理论值)
- if reporthook: # 写入前运行一次回调函数
+ size = int(header["Content-Length"]) # 文件总大小(理论值)
+ if reporthook: # 写入前运行一次回调函数
reporthook(blocknum, bs, size)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
@@ -236,35 +272,40 @@ def urlretrieve(url, filename=None, reporthook=None, headers=None):
def download(url, output_path, desc=None):
"""下载指定url的资源"""
# 支持“下载”本地资源,以供fc2fan的本地镜像所使用
- if not url.startswith('http'):
+ if not url.startswith("http"):
start_time = time.time()
shutil.copyfile(url, output_path)
filesize = os.path.getsize(url)
elapsed = time.time() - start_time
- info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed}
+ info = {"total": filesize, "elapsed": elapsed, "rate": filesize / elapsed}
return info
if not desc:
- desc = url.split('/')[-1]
+ desc = url.split("/")[-1]
referrer = headers.copy()
- referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分
- with DownloadProgressBar(unit='B', unit_scale=True,
- miniters=1, desc=desc, leave=False) as t:
+ referrer["referer"] = url[: url.find("/", 8) + 1] # 提取base_url部分
+ with DownloadProgressBar(
+ unit="B", unit_scale=True, miniters=1, desc=desc, leave=False
+ ) as t:
urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer)
- info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')}
+ info = {k: t.format_dict[k] for k in ("total", "elapsed", "rate")}
return info
def open_in_chrome(url, new=0, autoraise=True):
"""使用指定的Chrome Profile打开url,便于调试"""
import subprocess
- chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe'
+
+ chrome = R"C:\Program Files\Google\Chrome\Application\chrome.exe"
subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True)
+
import webbrowser
+
webbrowser.open = open_in_chrome
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
- download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg')
+ download("https://www.javbus.com/pics/cover/6n54_b.jpg", "cover.jpg")
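
The referer derivation in download() is plain string slicing: index 8 skips past the "https://" scheme, so the next "/" closes the host part (it also works for "http://", whose second slash sits at index 6):

    url = "https://www.javbus.com/pics/cover/6n54_b.jpg"
    print(url[: url.find("/", 8) + 1])  # -> https://www.javbus.com/
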
diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py
index 15267f1f7..da6884cc7 100644
--- a/javsp/web/dl_getchu.py
+++ b/javsp/web/dl_getchu.py
@@ -1,4 +1,5 @@
"""从dl.getchu官网抓取数据"""
+
import re
import logging
@@ -9,19 +10,19 @@
logger = logging.getLogger(__name__)
# https://dl.getchu.com/i/item4045373
-base_url = 'https://dl.getchu.com'
+base_url = "https://dl.getchu.com"
# dl.getchu用utf-8会乱码
-base_encode = 'euc-jp'
+base_encode = "euc-jp"
def get_movie_title(html):
container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]")
if len(container) > 0:
container = container[0]
- rows = container.xpath('.//tr')
- title = ''
+ rows = container.xpath(".//tr")
+ title = ""
for row in rows:
- for cell in row.xpath('.//td/div'):
+ for cell in row.xpath(".//td/div"):
# 获取单元格文本内容
if cell.text:
title = str(cell.text).strip()
@@ -29,11 +30,11 @@ def get_movie_title(html):
def get_movie_img(html, getchu_id):
- img_src = ''
+ img_src = ""
container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]')
if len(container) > 0:
container = container[0]
- img_src = container.get('src')
+ img_src = container.get("src")
return img_src
@@ -42,20 +43,22 @@ def get_movie_preview(html, getchu_id):
container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]')
if len(container) > 0:
for c in container:
- preview_pics.append(c.get('src'))
+ preview_pics.append(c.get("src"))
return preview_pics
-DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分')
+DURATION_PATTERN = re.compile(r"(?:動画)?(\d+)分")
+
+
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# 去除番号中的'GETCHU'字样
id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GETCHU-'):
- raise ValueError('Invalid GETCHU number: ' + movie.dvdid)
- getchu_id = id_uc.replace('GETCHU-', '')
+ if not id_uc.startswith("GETCHU-"):
+ raise ValueError("Invalid GETCHU number: " + movie.dvdid)
+ getchu_id = id_uc.replace("GETCHU-", "")
# 抓取网页
- url = f'{base_url}/i/item{getchu_id}'
+ url = f"{base_url}/i/item{getchu_id}"
r = request_get(url, delay_raise=True)
if r.status_code == 404:
raise MovieNotFoundError(__name__, movie.dvdid)
@@ -64,7 +67,7 @@ def parse_data(movie: MovieInfo):
if len(container) > 0:
container = container[0]
# 将表格提取为键值对
- rows = container.xpath('.//table/tr')
+ rows = container.xpath(".//table/tr")
kv_rows = [i for i in rows if len(i) == 2]
data = {}
for row in kv_rows:
@@ -80,26 +83,26 @@ def parse_data(movie: MovieInfo):
data[key] = value
for key, value in data.items():
- if key == 'サークル':
+ if key == "サークル":
movie.producer = value[0]
- elif key == '作者':
+ elif key == "作者":
# 暂时没有在getchu找到多个actress的片子
movie.actress = [i.strip() for i in value]
- elif key == '画像数&ページ数':
- match = DURATION_PATTERN.search(' '.join(value))
+ elif key == "画像数&ページ数":
+ match = DURATION_PATTERN.search(" ".join(value))
if match:
movie.duration = match.group(1)
- elif key == '配信開始日':
- movie.publish_date = value[0].replace('/', '-')
- elif key == '趣向':
+ elif key == "配信開始日":
+ movie.publish_date = value[0].replace("/", "-")
+ elif key == "趣向":
movie.genre = value
- elif key == '作品内容':
+ elif key == "作品内容":
idx = -1
for i, line in enumerate(value):
- if line.lstrip().startswith('※'):
+ if line.lstrip().startswith("※"):
idx = i
break
- movie.plot = ''.join(value[:idx])
+ movie.plot = "".join(value[:idx])
movie.title = get_movie_title(html)
movie.cover = get_movie_img(html, getchu_id)
@@ -114,7 +117,7 @@ def parse_data(movie: MovieInfo):
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('getchu-4041026')
+ movie = MovieInfo("getchu-4041026")
try:
parse_data(movie)
print(movie)
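
Note: the DURATION_PATTERN added above captures the minute count with or without the 動画 prefix. A quick standalone check:

    import re

    # Same pattern as the diff: optional 動画 prefix, digits captured before 分.
    DURATION_PATTERN = re.compile(r"(?:動画)?(\d+)分")

    for text in ("動画120分", "98分"):
        match = DURATION_PATTERN.search(text)
        if match:
            print(match.group(1))  # -> 120, then 98
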
diff --git a/javsp/web/exceptions.py b/javsp/web/exceptions.py
index 0ea720d01..5db4af266 100644
--- a/javsp/web/exceptions.py
+++ b/javsp/web/exceptions.py
@@ -1,6 +1,15 @@
"""网页抓取相关的异常"""
-__all__ = ['CrawlerError', 'MovieNotFoundError', 'MovieDuplicateError', 'SiteBlocked',
- 'SitePermissionError', 'CredentialError', 'WebsiteError', 'OtherError']
+
+__all__ = [
+ "CrawlerError",
+ "MovieNotFoundError",
+ "MovieDuplicateError",
+ "SiteBlocked",
+ "SitePermissionError",
+ "CredentialError",
+ "WebsiteError",
+ "OtherError",
+]
class CrawlerError(Exception):
@@ -9,6 +18,7 @@ class CrawlerError(Exception):
class MovieNotFoundError(CrawlerError):
"""表示某个站点没有抓取到某部影片"""
+
# 保持异常消息的简洁,同时又支持使用'logger.info(e, exc_info=True)'记录完整信息
def __init__(self, mod, avid, *args) -> None:
msg = f"{mod}: 未找到影片: '{avid}'"
@@ -20,6 +30,7 @@ def __str__(self):
class MovieDuplicateError(CrawlerError):
"""影片重复"""
+
def __init__(self, mod, avid, dup_count, *args) -> None:
msg = f"{mod}: '{avid}': 存在{dup_count}个完全匹配目标番号的搜索结果"
super().__init__(msg, *args)
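
Note: the comment in this hunk explains that the exception message stays concise while logger.info(e, exc_info=True) can still record the full details. A rough sketch of that usage pattern, assuming a simplified class (the real one also overrides __str__ and uses a Chinese message):

    import logging

    class MovieNotFoundError(Exception):
        """A site returned no page for the given ID (simplified from the diff)."""
        def __init__(self, mod, avid, *args):
            super().__init__(f"{mod}: movie not found: '{avid}'", *args)

    logging.basicConfig(level=logging.INFO)
    try:
        raise MovieNotFoundError("javsp.web.demo", "ABC-123")
    except MovieNotFoundError as e:
        logging.info(e, exc_info=True)  # concise message, full traceback kept
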
diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py
index e975c4c8f..305aaad8f 100644
--- a/javsp/web/fanza.py
+++ b/javsp/web/fanza.py
@@ -1,7 +1,6 @@
"""从fanza抓取数据"""
-import os
+
import re
-import sys
import json
import logging
from typing import Dict, List, Tuple
@@ -14,25 +13,43 @@
logger = logging.getLogger(__name__)
-base_url = 'https://www.dmm.co.jp'
+base_url = "https://www.dmm.co.jp"
# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
request = Request()
-request.cookies = {'age_check_done': '1'}
-request.headers['Accept-Language'] = 'ja,en-US;q=0.9'
+request.cookies = {"age_check_done": "1"}
+request.headers["Accept-Language"] = "ja,en-US;q=0.9"
+
+
+_PRODUCT_PRIORITY = {"digital": 10, "mono": 5, "monthly": 2, "rental": 1}
+_TYPE_PRIORITY = {
+ "videoa": 10,
+ "anime": 8,
+ "nikkatsu": 6,
+ "doujin": 4,
+ "dvd": 3,
+ "ppr": 2,
+ "paradisetv": 1,
+}
-_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
-_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}
def sort_search_result(result: List[Dict]):
"""排序搜索结果"""
- scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result}
- sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True)
+ scores = {
+ i["url"]: (
+ _PRODUCT_PRIORITY.get(i["product"], 0),
+ _TYPE_PRIORITY.get(i["type"], 0),
+ )
+ for i in result
+ }
+ sorted_result = sorted(result, key=lambda x: scores[x["url"]], reverse=True)
return sorted_result
def get_urls_of_cid(cid: str) -> Tuple[str, str]:
"""搜索cid可能的影片URL"""
- r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0")
+ r = request.get(
+ f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0"
+ )
if r.status_code == 404:
raise MovieNotFoundError(__name__, cid)
r.raise_for_status()
@@ -40,19 +57,21 @@ def get_urls_of_cid(cid: str) -> Tuple[str, str]:
result = html.xpath("//ul[@id='list']/li/div/p/a/@href")
parsed_result = {}
for url in result:
- items = url.split('/')
+ items = url.split("/")
type_, cid = None, None
for i, part in enumerate(items):
- if part == '-':
- product, type_ = items[i-2], items[i-1]
- elif part.startswith('cid='):
+ if part == "-":
+ product, type_ = items[i - 2], items[i - 1]
+ elif part.startswith("cid="):
cid = part[4:]
- new_url = '/'.join(i for i in items if not i.startswith('?')) + '/'
- parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url})
+ new_url = "/".join(i for i in items if not i.startswith("?")) + "/"
+ parsed_result.setdefault(cid, []).append(
+ {"product": product, "type": type_, "url": new_url}
+ )
break
if cid not in parsed_result:
if len(result) > 0:
- logger.debug(f"Unknown URL in search result: " + ', '.join(result))
+ logger.debug("Unknown URL in search result: " + ", ".join(result))
raise MovieNotFoundError(__name__, cid)
sorted_result = sort_search_result(parsed_result[cid])
return sorted_result
@@ -60,16 +79,18 @@ def get_urls_of_cid(cid: str) -> Tuple[str, str]:
def resp2html_wrapper(resp):
html = resp2html(resp)
- if 'not available in your region' in html.text_content():
- raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
- elif '/login/' in resp.url:
- raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ if "not available in your region" in html.text_content():
+ raise SiteBlocked(
+ "FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置"
+ )
+ elif "/login/" in resp.url:
+ raise SiteBlocked("FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP")
return html
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
- default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
+ default_url = f"{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/"
r0 = request.get(default_url, delay_raise=True)
if r0.status_code == 404:
urls = get_urls_of_cid(movie.cid)
@@ -80,16 +101,18 @@ def parse_data(movie: MovieInfo):
else:
logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}")
continue
- r = request.get(d['url'])
+ r = request.get(d["url"])
html = resp2html_wrapper(r)
try:
parse_func(movie, html)
- movie.url = d['url']
+ movie.url = d["url"]
break
except:
logger.debug(f"Fail to parse {d['url']}", exc_info=True)
if d is urls[-1]:
- logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败")
+ logger.warning(
+ f"在fanza查找到的cid={movie.cid}的影片页面均解析失败"
+ )
raise
else:
html = resp2html_wrapper(r0)
@@ -104,22 +127,32 @@ def parse_videoa_page(movie: MovieInfo, html):
container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
# 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083
- date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")
+ date_tag = container.xpath(
+ "//td[text()='配信開始日:']/following-sibling::td/text()"
+ )
if date_tag:
- movie.publish_date = date_tag[0].strip().replace('/', '-')
- duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
- match = re.search(r'\d+', duration_str)
+ movie.publish_date = date_tag[0].strip().replace("/", "-")
+ duration_str = container.xpath(
+ "//td[text()='収録時間:']/following-sibling::td/text()"
+ )[0].strip()
+ match = re.search(r"\d+", duration_str)
if match:
movie.duration = match.group(0)
# 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况
actress = container.xpath("//span[@id='performer']/a/text()")
- director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
+ director_tag = container.xpath(
+ "//td[text()='監督:']/following-sibling::td/a/text()"
+ )
if director_tag:
movie.director = director_tag[0].strip()
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ serial_tag = container.xpath(
+ "//td[text()='シリーズ:']/following-sibling::td/a/text()"
+ )
if serial_tag:
movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ producer_tag = container.xpath(
+ "//td[text()='メーカー:']/following-sibling::td/a/text()"
+ )
if producer_tag:
movie.producer = producer_tag[0].strip()
# label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
@@ -127,40 +160,48 @@ def parse_videoa_page(movie: MovieInfo, html):
# if label_tag:
# label = label_tag[0].strip()
# fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]")
+ genre_tags = container.xpath(
+ "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]"
+ )
genre, genre_id = [], []
for tag in genre_tags:
genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ genre_id.append(tag.get("href").split("=")[-1].strip("/"))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[
+ 0
+ ].strip()
plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip()
preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
score_tag = container.xpath("//p[@class='d-review__average']/strong/text()")
if score_tag:
- match = re.search(r'\d+', score_tag[0].strip())
+ match = re.search(r"\d+", score_tag[0].strip())
if match:
score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
+ movie.score = f"{score:.2f}"
else:
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
-
+ score_img = container.xpath(
+ "//td[text()='平均評価:']/following-sibling::td/img/@src"
+ )[0]
+ movie.score = int(score_img.split("/")[-1].split(".")[0]) # 00, 05 ... 50
+
if Cfg().crawler.hardworking:
# 预览视频是动态加载的,不在静态网页中
- video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
+ video_url = f"{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}"
html2 = request.get_html(video_url)
# 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据
- script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
- match = re.search(r'\{.*\}', script)
+ script = html2.xpath(
+ "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
+ )[0].strip()
+ match = re.search(r"\{.*\}", script)
# 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配
try:
data = json.loads(match.group())
- video_url = data.get('src')
- if video_url and video_url.startswith('//'):
- video_url = 'https:' + video_url
+ video_url = data.get("src")
+ if video_url and video_url.startswith("//"):
+ video_url = "https:" + video_url
movie.preview_video = video_url
except Exception as e:
- logger.debug('解析视频地址时异常: ' + repr(e))
+ logger.debug("解析视频地址时异常: " + repr(e))
movie.cid = cid
movie.title = title
@@ -170,7 +211,7 @@ def parse_videoa_page(movie: MovieInfo, html):
movie.genre_id = genre_id
movie.plot = plot
movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
def parse_anime_page(movie: MovieInfo, html):
@@ -178,27 +219,41 @@ def parse_anime_page(movie: MovieInfo, html):
title = html.xpath("//h1[@id='title']/text()")[0]
container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
cover = container.xpath("//img[@name='package-image']/@src")[0]
- date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip()
- publish_date = date_str.replace('/', '-')
- duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")
+ date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[
+ 0
+ ].strip()
+ publish_date = date_str.replace("/", "-")
+ duration_tag = container.xpath(
+ "//td[text()='収録時間:']/following-sibling::td/text()"
+ )
if duration_tag:
- movie.duration = duration_tag[0].strip().replace('分', '')
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ movie.duration = duration_tag[0].strip().replace("分", "")
+ serial_tag = container.xpath(
+ "//td[text()='シリーズ:']/following-sibling::td/a/text()"
+ )
if serial_tag:
movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ producer_tag = container.xpath(
+ "//td[text()='メーカー:']/following-sibling::td/a/text()"
+ )
if producer_tag:
movie.producer = producer_tag[0].strip()
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
+ genre_tags = container.xpath(
+ "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
+ )
genre, genre_id = [], []
for tag in genre_tags:
genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ genre_id.append(tag.get("href").split("=")[-1].strip("/"))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[
+ 0
+ ].strip()
plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip()
preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy")
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
+ score_img = container.xpath(
+ "//td[text()='平均評価:']/following-sibling::td/img/@src"
+ )[0]
+ score = int(score_img.split("/")[-1].split(".")[0]) # 00, 05 ... 50
movie.cid = cid
movie.title = title
@@ -207,9 +262,9 @@ def parse_anime_page(movie: MovieInfo, html):
movie.genre = genre
movie.genre_id = genre_id
movie.plot = plot
- movie.score = f'{score/5:.2f}' # 转换为10分制
+ movie.score = f"{score/5:.2f}" # 转换为10分制
movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
# parse_dvd_page = parse_videoa_page # 118wtktabf067
@@ -220,10 +275,11 @@ def parse_anime_page(movie: MovieInfo, html):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo(cid='d_aisoft3356')
+ movie = MovieInfo(cid="d_aisoft3356")
try:
parse_data(movie)
print(movie)
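
Note: the reformatted sort_search_result ranks every URL by a (product, type) priority tuple, and Python compares tuples element by element, so "digital" beats "mono" before the type is even consulted. A standalone sketch with placeholder entries:

    _PRODUCT_PRIORITY = {"digital": 10, "mono": 5, "monthly": 2, "rental": 1}
    _TYPE_PRIORITY = {"videoa": 10, "anime": 8, "dvd": 3}

    result = [
        {"url": "u1", "product": "mono", "type": "dvd"},
        {"url": "u2", "product": "digital", "type": "videoa"},
        {"url": "u3", "product": "monthly", "type": "anime"},
    ]
    # Unknown products/types fall back to 0 and sink to the end.
    ranked = sorted(
        result,
        key=lambda x: (
            _PRODUCT_PRIORITY.get(x["product"], 0),
            _TYPE_PRIORITY.get(x["type"], 0),
        ),
        reverse=True,
    )
    print([i["url"] for i in ranked])  # -> ['u2', 'u1', 'u3']
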
diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py
index 66be7ae4e..5e47df6e7 100644
--- a/javsp/web/fc2.py
+++ b/javsp/web/fc2.py
@@ -1,4 +1,5 @@
"""从FC2官网抓取数据"""
+
import logging
@@ -10,12 +11,12 @@
logger = logging.getLogger(__name__)
-base_url = 'https://adult.contents.fc2.com'
+base_url = "https://adult.contents.fc2.com"
def get_movie_score(fc2_id):
"""通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None"""
- html = get_html(f'{base_url}/article/{fc2_id}/review')
+ html = get_html(f"{base_url}/article/{fc2_id}/review")
review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li")
reviews = {}
for tag in review_tags:
@@ -23,9 +24,9 @@ def get_movie_score(fc2_id):
vote = int(tag.xpath("span")[0].text_content())
reviews[score] = vote
total_votes = sum(reviews.values())
- if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧
- summary = sum([k*v for k, v in reviews.items()])
- final_score = summary / total_votes * 2 # 乘以2转换为10分制
+ if total_votes >= 2: # 至少也该有两个人评价才有参考意义一点吧
+ summary = sum([k * v for k, v in reviews.items()])
+ final_score = summary / total_votes * 2 # 乘以2转换为10分制
return final_score
@@ -33,14 +34,14 @@ def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# 去除番号中的'FC2'字样
id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
+ if not id_uc.startswith("FC2-"):
+ raise ValueError("Invalid FC2 number: " + movie.dvdid)
+ fc2_id = id_uc.replace("FC2-", "")
# 抓取网页
- url = f'{base_url}/article/{fc2_id}/'
+ url = f"{base_url}/article/{fc2_id}/"
resp = request_get(url)
- if '/id.fc2.com/' in resp.url:
- raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ if "/id.fc2.com/" in resp.url:
+ raise SiteBlocked("FC2要求当前IP登录账号才可访问,请尝试更换为日本IP")
html = resp2html(resp)
container = html.xpath("//div[@class='items_article_left']")
if len(container) > 0:
@@ -49,7 +50,7 @@ def parse_data(movie: MovieInfo):
raise MovieNotFoundError(__name__, movie.dvdid)
# FC2 标题增加反爬乱码,使用数组合并标题
title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()")
- title = ''.join(title_arr)
+ title = "".join(title_arr)
thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0]
thumb_pic = thumb_tag.xpath("span/img/@src")[0]
duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0]
@@ -57,25 +58,31 @@ def parse_data(movie: MovieInfo):
producer = container.xpath("//li[text()='by ']/a/text()")[0]
genre = container.xpath("//a[@class='tag tagTag']/text()")
date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0]
- publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30'
+ publish_date = date_str[-10:].replace("/", "-") # '販売日 : 2017/11/30'
preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href")
if Cfg().crawler.hardworking:
# 通过评论数据来计算准确的评分
score = get_movie_score(fc2_id)
if score:
- movie.score = f'{score:.2f}'
+ movie.score = f"{score:.2f}"
# 预览视频是动态加载的,不在静态网页中
- desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0]
- key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa...
- api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
+ desc_frame_url = container.xpath(
+ "//section[@class='items_article_Contents']/iframe/@src"
+ )[0]
+ key = desc_frame_url.split("=")[
+ -1
+ ] # /widget/article/718323/description?ac=60fc08fa...
+ api_url = f"{base_url}/api/v2/videos/{fc2_id}/sample?key={key}"
r = request_get(api_url).json()
- movie.preview_video = r['path']
+ movie.preview_video = r["path"]
else:
# 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星
- score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0]
+ score_tag_attr = container.xpath(
+ "//a[@class='items_article_Stars']/p/span/@class"
+ )[0]
score = int(score_tag_attr[-1]) * 2
- movie.score = f'{score:.2f}'
+ movie.score = f"{score:.2f}"
movie.dvdid = id_uc
movie.url = url
@@ -94,10 +101,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('FC2-718323')
+ movie = MovieInfo("FC2-718323")
try:
parse_data(movie)
print(movie)
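
Note: get_movie_score, reformatted above, is a weighted average of 1-5 star votes scaled to a 10-point score, reported only once at least two votes exist. The arithmetic in isolation, with sample vote counts:

    # reviews maps star rating -> vote count.
    reviews = {5: 3, 4: 1, 2: 1}
    total_votes = sum(reviews.values())
    if total_votes >= 2:  # a single vote is not considered meaningful
        final_score = sum(k * v for k, v in reviews.items()) / total_votes * 2
        print(f"{final_score:.2f}")  # -> 8.40
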
diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py
index 229b3e3df..f3baa1d3b 100644
--- a/javsp/web/fc2fan.py
+++ b/javsp/web/fc2fan.py
@@ -1,4 +1,5 @@
"""解析fc2fan本地镜像的数据"""
+
# FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据
import os
import re
@@ -21,7 +22,7 @@
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
if use_local_mirror:
- html_file = f'{base_path}/{movie.dvdid}.html'
+ html_file = f"{base_path}/{movie.dvdid}.html"
if not os.path.exists(html_file):
raise MovieNotFoundError(__name__, movie.dvdid, html_file)
html = lxml.html.parse(html_file)
@@ -30,23 +31,23 @@ def parse_data(movie: MovieInfo):
r = requests.get(url)
if r.status_code == 404:
raise MovieNotFoundError(__name__, movie.dvdid)
- elif r.text == '':
- raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}')
+ elif r.text == "":
+ raise WebsiteError(f"fc2fan: 站点不可用 (HTTP {r.status_code}): {url}")
html = resp2html(r)
try:
container = html.xpath("//div[@class='col-sm-8']")[0]
except IndexError:
- raise WebsiteError(f'fc2fan: 站点不可用')
+ raise WebsiteError("fc2fan: 站点不可用")
title = container.xpath("h3/text()")[0]
score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
- match = re.search(r'\d+', score_str)
+ match = re.search(r"\d+", score_str)
if match:
- score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
- movie.score = f'{score:.1f}'
+ score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
+ movie.score = f"{score:.1f}"
resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail
- if '无码' in resource_info:
+ if "无码" in resource_info:
movie.uncensored = True
- elif '有码' in resource_info:
+ elif "有码" in resource_info:
movie.uncensored = False
# FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商
producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text
@@ -56,7 +57,9 @@ def parse_data(movie: MovieInfo):
actress = container.xpath("h5/strong[text()='女优名字']/../a/text()")
preview_pics = container.xpath("//ul[@class='slides']/li/img/@src")
if use_local_mirror:
- preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics]
+ preview_pics = [
+ os.path.normpath(os.path.join(base_path, i)) for i in preview_pics
+ ]
# big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到
movie.title = title
@@ -69,10 +72,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('FC2-1879420')
+ movie = MovieInfo("FC2-1879420")
try:
parse_data(movie)
print(movie)
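
Note: the score handling above divides by 10 because the fc2fan site owner rates films out of 100. The conversion in isolation, with a stand-in for the scraped text:

    import re

    score_str = "92分"  # hypothetical tail text following 影片评分
    match = re.search(r"\d+", score_str)
    if match:
        score = int(match.group()) / 10  # site scores are out of 100
        print(f"{score:.1f}")  # -> 9.2
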
diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py
index b0ad60892..ad06a5b9d 100644
--- a/javsp/web/fc2ppvdb.py
+++ b/javsp/web/fc2ppvdb.py
@@ -1,4 +1,5 @@
"""从FC2PPVDB抓取数据"""
+
import logging
from typing import List
@@ -10,35 +11,47 @@
logger = logging.getLogger(__name__)
-base_url = 'https://fc2ppvdb.com'
+base_url = "https://fc2ppvdb.com"
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# 去除番号中的'FC2'字样
id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
+ if not id_uc.startswith("FC2-"):
+ raise ValueError("Invalid FC2 number: " + movie.dvdid)
+ fc2_id = id_uc.replace("FC2-", "")
# 抓取网页
- url = f'{base_url}/articles/{fc2_id}'
+ url = f"{base_url}/articles/{fc2_id}"
html = get_html(url)
- container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]")
+ container = html.xpath(
+ "//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]"
+ )
if len(container) > 0:
container = container[0]
else:
raise MovieNotFoundError(__name__, movie.dvdid)
-
+
title = container.xpath("//h2/a/text()")
thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src")
- duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()")
+ duration_str = container.xpath(
+ "//div[starts-with(text(),'収録時間:')]/span/text()"
+ )
actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()")
genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()")
publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()")
publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()")
- uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()")
- uncensored_str_f = get_list_first(uncensored_str);
- uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None
+ uncensored_str = container.xpath(
+ "//div[starts-with(text(),'モザイク:')]/span/text()"
+ )
+ uncensored_str_f = get_list_first(uncensored_str)
+ uncensored = (
+ True
+ if uncensored_str_f == "無"
+ else False
+ if uncensored_str_f == "有"
+ else None
+ )
preview_pics = None
preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href")
@@ -60,15 +73,18 @@ def parse_data(movie: MovieInfo):
else:
movie.cover = get_list_first(thumb_pic)
-def get_list_first(list:List):
+
+def get_list_first(list: List):
return list[0] if list and len(list) > 0 else None
+
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('FC2-4497837')
+ movie = MovieInfo("FC2-4497837")
try:
parse_data(movie)
print(movie)
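
Note: Black's layout of the chained conditional above is hard to scan; the same 無/有 tri-state mapping can also be written as a dict lookup. This is an equivalent alternative, not what the diff itself does:

    def to_uncensored(value):
        # "無" (no mosaic) -> True, "有" (mosaic) -> False, anything else -> None
        return {"無": True, "有": False}.get(value)

    assert to_uncensored("無") is True
    assert to_uncensored("有") is False
    assert to_uncensored(None) is None
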
diff --git a/javsp/web/gyutto.py b/javsp/web/gyutto.py
index db7d6c795..d24d592b8 100644
--- a/javsp/web/gyutto.py
+++ b/javsp/web/gyutto.py
@@ -1,4 +1,5 @@
"""从https://gyutto.com/官网抓取数据"""
+
import logging
import time
@@ -9,38 +10,41 @@
logger = logging.getLogger(__name__)
# https://dl.gyutto.com/i/item266923
-base_url = 'http://gyutto.com'
-base_encode = 'euc-jp'
+base_url = "http://gyutto.com"
+base_encode = "euc-jp"
+
def get_movie_title(html):
container = html.xpath("//h1")
if len(container) > 0:
container = container[0]
title = container.text
-
+
return title
-def get_movie_img(html, index = 1):
+
+def get_movie_img(html, index=1):
images = []
container = html.xpath("//a[@class='highslide']/img")
if len(container) > 0:
if index == 0:
- return container[0].get('src')
-
+ return container[0].get("src")
+
for row in container:
- images.append(row.get('src'))
+ images.append(row.get("src"))
return images
+
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# 去除番号中的'gyutto'字样
id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GYUTTO-'):
- raise ValueError('Invalid gyutto number: ' + movie.dvdid)
- gyutto_id = id_uc.replace('GYUTTO-', '')
+ if not id_uc.startswith("GYUTTO-"):
+ raise ValueError("Invalid gyutto number: " + movie.dvdid)
+ gyutto_id = id_uc.replace("GYUTTO-", "")
# 抓取网页
- url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1'
+ url = f"{base_url}/i/item{gyutto_id}?select_uaflag=1"
r = request_get(url, delay_raise=True)
if r.status_code == 404:
raise MovieNotFoundError(__name__, movie.dvdid)
@@ -50,17 +54,17 @@ def parse_data(movie: MovieInfo):
for row in container:
key = row.xpath(".//dt/text()")
if key[0] == "サークル":
- producer = ''.join(row.xpath(".//dd/a/text()"))
+ producer = "".join(row.xpath(".//dd/a/text()"))
elif key[0] == "ジャンル":
genre = row.xpath(".//dd/a/text()")
elif key[0] == "配信開始日":
date = row.xpath(".//dd/text()")
- date_str = ''.join(date)
+ date_str = "".join(date)
date_time = time.strptime(date_str, "%Y年%m月%d日")
publish_date = time.strftime("%Y-%m-%d", date_time)
plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0]
-
+
movie.title = get_movie_title(html)
movie.cover = get_movie_img(html, 0)
movie.preview_pics = get_movie_img(html)
@@ -73,12 +77,13 @@ def parse_data(movie: MovieInfo):
movie.genre = genre
movie.plot = plot
+
if __name__ == "__main__":
import pretty_errors
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('gyutto-266923')
+ movie = MovieInfo("gyutto-266923")
try:
parse_data(movie)
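
Note: the 配信開始日 branch above parses a Japanese date using literal 年/月/日 characters in the format string, then re-emits it as ISO. Standalone:

    import time

    date_str = "2021年3月5日"  # sample value in the site's format
    date_time = time.strptime(date_str, "%Y年%m月%d日")
    print(time.strftime("%Y-%m-%d", date_time))  # -> 2021-03-05
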
diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py
index 4e42617a5..2f3df7f0d 100644
--- a/javsp/web/jav321.py
+++ b/javsp/web/jav321.py
@@ -1,4 +1,5 @@
"""从jav321抓取数据"""
+
import re
import logging
@@ -9,17 +10,17 @@
logger = logging.getLogger(__name__)
-base_url = 'https://www.jav321.com'
+base_url = "https://www.jav321.com"
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
- html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
+ html = post_html(f"{base_url}/search", data={"sn": movie.dvdid})
page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
- #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
- cid = page_url.split('/')[-1] # /video/ipx00177
+ # TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
+ cid = page_url.split("/")[-1] # /video/ipx00177
# 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片
- if cid == 'search':
+ if cid == "search":
raise MovieNotFoundError(__name__, movie.dvdid)
title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
info = html.xpath("//div[@class='col-md-9']")[0]
@@ -30,10 +31,12 @@ def parse_data(movie: MovieInfo):
# actress, actress_pics
# jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白
actress, actress_pics = [], {}
- actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
+ actress_tags = html.xpath(
+ "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img"
+ )
for tag in actress_tags:
name = tag.tail.strip()
- pic_url = tag.get('src')
+ pic_url = tag.get("src")
actress.append(name)
# jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url,
# 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据
@@ -43,17 +46,19 @@ def parse_data(movie: MovieInfo):
genre, genre_id = [], []
for tag in genre_tags:
genre.append(tag.text)
- genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1
- dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
- publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
+ genre_id.append(tag.get("href").split("/")[-2]) # genre/4025/1
+ dvdid = info.xpath("b[text()='品番']")[0].tail.replace(": ", "").upper()
+ publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(": ", "")
duration_str = info.xpath("b[text()='収録時間']")[0].tail
- match = re.search(r'\d+', duration_str)
+ match = re.search(r"\d+", duration_str)
if match:
movie.duration = match.group(0)
# 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星
- score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
+ score_tag = info.xpath(
+ "//b[text()='平均評価']/following-sibling::img/@data-original"
+ )
if score_tag:
- score = int(score_tag[0][5:7])/5 # /10*2
+ score = int(score_tag[0][5:7]) / 5 # /10*2
movie.score = str(score)
serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
if serial_tag:
@@ -61,15 +66,21 @@ def parse_data(movie: MovieInfo):
preview_video_tag = info.xpath("//video/source/@src")
if preview_video_tag:
movie.preview_video = preview_video_tag[0]
- plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
+ plot_tag = info.xpath(
+ "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
+ )
if plot_tag:
movie.plot = plot_tag[0]
- preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
+ preview_pics = html.xpath(
+ "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
+ )
if len(preview_pics) == 0:
# 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL
- preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src")
+ preview_pics = html.xpath(
+ "//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src"
+ )
# 有的图片链接里有多个//,网站质量堪忧……
- preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics]
+ preview_pics = [i[:8] + i[8:].replace("//", "/") for i in preview_pics]
# 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析
movie.url = page_url
@@ -89,10 +100,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('SCUTE-1177')
+ movie = MovieInfo("SCUTE-1177")
try:
parse_data(movie)
print(movie)
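
Note: the preview_pics cleanup above collapses duplicated slashes while leaving the scheme's "//" intact by skipping the first 8 characters. For example (placeholder URL):

    pic = "https://www.jav321.com//digital//video//ipx00177/1.jpg"
    # pic[:8] is "https://"; only the remainder has its "//" collapsed.
    fixed = pic[:8] + pic[8:].replace("//", "/")
    print(fixed)  # -> https://www.jav321.com/digital/video/ipx00177/1.jpg
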
diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py
index a98cd9974..e40c29025 100644
--- a/javsp/web/javbus.py
+++ b/javsp/web/javbus.py
@@ -1,4 +1,5 @@
"""从JavBus抓取数据"""
+
import logging
@@ -10,8 +11,8 @@
logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javbus.csv')
-permanent_url = 'https://www.javbus.com'
+genre_map = GenreMap("data/genre_javbus.csv")
+permanent_url = "https://www.javbus.com"
if Cfg().network.proxy_server is not None:
base_url = permanent_url
else:
@@ -23,7 +24,7 @@ def parse_data(movie: MovieInfo):
Args:
movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
"""
- url = f'{base_url}/{movie.dvdid}'
+ url = f"{base_url}/{movie.dvdid}"
resp = request_get(url, delay_raise=True)
# 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
if resp.history and resp.history[0].status_code == 302:
@@ -31,8 +32,8 @@ def parse_data(movie: MovieInfo):
else:
html = resp2html(resp)
# 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
- page_title = html.xpath('/html/head/title/text()')
- if page_title and page_title[0].startswith('404 Page Not Found!'):
+ page_title = html.xpath("/html/head/title/text()")
+ if page_title and page_title[0].startswith("404 Page Not Found!"):
raise MovieNotFoundError(__name__, movie.dvdid)
container = html.xpath("//div[@class='container']")[0]
@@ -42,9 +43,9 @@ def parse_data(movie: MovieInfo):
info = container.xpath("//div[@class='col-md-3 info']")[0]
dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
+ duration = info.xpath("p/span[text()='長度:']")[0].tail.replace("分鐘", "").strip()
director_tag = info.xpath("p/span[text()='導演:']")
- if director_tag: # xpath没有匹配时将得到空列表
+ if director_tag: # xpath没有匹配时将得到空列表
movie.director = director_tag[0].getnext().text.strip()
producer_tag = info.xpath("p/span[text()='製作商:']")
if producer_tag:
@@ -61,12 +62,12 @@ def parse_data(movie: MovieInfo):
genre_tags = info.xpath("//span[@class='genre']/label/a")
genre, genre_id = [], []
for tag in genre_tags:
- tag_url = tag.get('href')
- pre_id = tag_url.split('/')[-1]
+ tag_url = tag.get("href")
+ pre_id = tag_url.split("/")[-1]
genre.append(tag.text)
- if 'uncensored' in tag_url:
+ if "uncensored" in tag_url:
movie.uncensored = True
- genre_id.append('uncensored-' + pre_id)
+ genre_id.append("uncensored-" + pre_id)
else:
movie.uncensored = False
genre_id.append(pre_id)
@@ -75,18 +76,18 @@ def parse_data(movie: MovieInfo):
actress, actress_pics = [], {}
actress_tags = html.xpath("//a[@class='avatar-box']/div/img")
for tag in actress_tags:
- name = tag.get('title')
- pic_url = tag.get('src')
+ name = tag.get("title")
+ pic_url = tag.get("src")
actress.append(name)
- if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像
+ if not pic_url.endswith("nowprinting.gif"): # 略过默认的头像
actress_pics[name] = pic_url
# 整理数据并更新movie的相应属性
- movie.url = f'{permanent_url}/{movie.dvdid}'
+ movie.url = f"{permanent_url}/{movie.dvdid}"
movie.dvdid = dvdid
- movie.title = title.replace(dvdid, '').strip()
+ movie.title = title.replace(dvdid, "").strip()
movie.cover = cover
movie.preview_pics = preview_pics
- if publish_date != '0000-00-00': # 丢弃无效的发布日期
+ if publish_date != "0000-00-00": # 丢弃无效的发布日期
movie.publish_date = publish_date
movie.duration = duration if int(duration) else None
movie.genre = genre
@@ -99,15 +100,16 @@ def parse_clean_data(movie: MovieInfo):
"""解析指定番号的影片数据并进行清洗"""
parse_data(movie)
movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
+ movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('NANP-030')
+ movie = MovieInfo("NANP-030")
try:
parse_clean_data(movie)
print(movie)
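
Note: the genre loop above namespaces IDs that come from uncensored URLs and sets the movie-level flag per tag. The control flow in isolation, with (text, href) pairs standing in for the lxml tags:

    genre_tags = [("中出", "/genre/10"), ("無修正", "/uncensored/genre/5")]
    genre, genre_id, uncensored = [], [], None
    for text, href in genre_tags:
        pre_id = href.split("/")[-1]
        genre.append(text)
        if "uncensored" in href:
            uncensored = True
            genre_id.append("uncensored-" + pre_id)
        else:
            uncensored = False
            genre_id.append(pre_id)
    print(genre_id, uncensored)  # -> ['10', 'uncensored-5'] True
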
diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py
index 5120aae76..149f05eeb 100644
--- a/javsp/web/javdb.py
+++ b/javsp/web/javdb.py
@@ -1,4 +1,5 @@
"""从JavDB抓取数据"""
+
import os
import re
import logging
@@ -14,11 +15,13 @@
# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析
request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'
+request.headers["Accept-Language"] = (
+ "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5"
+)
logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javdb.csv')
-permanent_url = 'https://javdb.com'
+genre_map = GenreMap("data/genre_javdb.csv")
+permanent_url = "https://javdb.com"
if Cfg().network.proxy_server is not None:
base_url = permanent_url
else:
@@ -31,29 +34,39 @@ def get_html_wrapper(url):
r = request.get(url, delay_raise=True)
if r.status_code == 200:
# 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页
- if r.history and '/login' in r.url:
+ if r.history and "/login" in r.url:
# 仅在需要时去读取Cookies
- if 'cookies_pool' not in globals():
+ if "cookies_pool" not in globals():
try:
cookies_pool = get_browsers_cookies()
except (PermissionError, OSError) as e:
- logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
+ logger.warning(
+ f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件",
+ exc_info=True,
+ )
cookies_pool = []
except Exception as e:
- logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
+ logger.warning(
+ f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器",
+ exc_info=True,
+ )
cookies_pool = []
if len(cookies_pool) > 0:
item = cookies_pool.pop()
# 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
request = Request(use_scraper=True)
- request.cookies = item['cookies']
- cookies_source = (item['profile'], item['site'])
- logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
+ request.cookies = item["cookies"]
+ cookies_source = (item["profile"], item["site"])
+ logger.debug(
+ f"未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}"
+ )
return get_html_wrapper(url)
else:
- raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
- elif r.history and 'pay' in r.url.split('/')[-1]:
- raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
+ raise CredentialError("JavDB: 所有浏览器Cookies均已过期")
+ elif r.history and "pay" in r.url.split("/")[-1]:
+ raise SitePermissionError(
+ f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'"
+ )
else:
html = resp2html(r)
return html
@@ -62,42 +75,48 @@ def get_html_wrapper(url):
code_tag = html.xpath("//span[@class='code-label']/span")
error_code = code_tag[0].text if code_tag else None
if error_code:
- if error_code == '1020':
- block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
+ if error_code == "1020":
+ block_msg = f"JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器"
else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})'
+ block_msg = (
+ f"JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})"
+ )
else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url}'
+ block_msg = f"JavDB: {r.status_code} 禁止访问: {url}"
raise SiteBlocked(block_msg)
else:
- raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}')
+ raise WebsiteError(f"JavDB: {r.status_code} 非预期状态码: {url}")
def get_user_info(site, cookies):
"""获取cookies对应的JavDB用户信息"""
try:
request.cookies = cookies
- html = request.get_html(f'https://{site}/users/profile')
+ html = request.get_html(f"https://{site}/users/profile")
except Exception as e:
- logger.info('JavDB: 获取用户信息时出错')
+ logger.info("JavDB: 获取用户信息时出错")
logger.debug(e, exc_info=1)
return
# 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
- if 'JavDB' in html.text:
- email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
- username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
+ if "JavDB" in html.text:
+ email = html.xpath(
+ "//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()"
+ )[0].strip()
+ username = html.xpath(
+ "//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()"
+ )[0].strip()
return email, username
else:
- logger.debug('JavDB: 域名已过期: ' + site)
+ logger.debug("JavDB: 域名已过期: " + site)
def get_valid_cookies():
"""扫描浏览器,获取一个可用的Cookies"""
# 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
for d in cookies_pool:
- info = get_user_info(d['site'], d['cookies'])
+ info = get_user_info(d["site"], d["cookies"])
if info:
- return d['cookies']
+ return d["cookies"]
else:
logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
@@ -108,7 +127,7 @@ def parse_data(movie: MovieInfo):
movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
"""
# JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
- html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
+ html = get_html_wrapper(f"{base_url}/search?q={movie.dvdid}")
ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()")))
movie_urls = html.xpath("//a[@class='box']/@href")
match_count = len([i for i in ids if i == movie.dvdid.lower()])
@@ -123,11 +142,11 @@ def parse_data(movie: MovieInfo):
# 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
box = html.xpath("//a[@class='box']")[index]
movie.url = new_url
- movie.title = box.get('title')
+ movie.title = box.get("title")
movie.cover = box.xpath("div/img/@src")[0]
score_str = box.xpath("div[@class='score']/span/span")[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
+ score = re.search(r"([\d.]+)分", score_str).group(1)
+ movie.score = "{:.2f}".format(float(score) * 2)
movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
return
else:
@@ -136,25 +155,34 @@ def parse_data(movie: MovieInfo):
container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
title = container.xpath("h2/strong[@class='current-title']/text()")[0]
- show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
+ show_orig_title = container.xpath(
+ "//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]"
+ )
if show_orig_title:
movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
cover = container.xpath("//img[@class='video-cover']/@src")[0]
- preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
+ preview_pics = container.xpath(
+ "//a[@class='tile-item'][@data-fancybox='gallery']/@href"
+ )
preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
if preview_video_tag:
preview_video = preview_video_tag[0]
- if preview_video.startswith('//'):
- preview_video = 'https:' + preview_video
+ if preview_video.startswith("//"):
+ preview_video = "https:" + preview_video
movie.preview_video = preview_video
dvdid = info.xpath("div/span")[0].text_content()
publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
- duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
+ duration = (
+ info.xpath("div/strong[text()='時長:']")[0]
+ .getnext()
+ .text.replace("分鍾", "")
+ .strip()
+ )
director_tag = info.xpath("div/strong[text()='導演:']")
if director_tag:
movie.director = director_tag[0].getnext().text_content().strip()
av_type = guess_av_type(movie.dvdid)
- if av_type != 'fc2':
+ if av_type != "fc2":
producer_tag = info.xpath("div/strong[text()='片商:']")
else:
producer_tag = info.xpath("div/strong[text()='賣家:']")
@@ -169,27 +197,29 @@ def parse_data(movie: MovieInfo):
score_tag = info.xpath("//span[@class='score-stars']")
if score_tag:
score_str = score_tag[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
+ score = re.search(r"([\d.]+)分", score_str).group(1)
+ movie.score = "{:.2f}".format(float(score) * 2)
genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
genre, genre_id = [], []
for tag in genre_tags:
- pre_id = tag.get('href').split('/')[-1]
+ pre_id = tag.get("href").split("/")[-1]
genre.append(tag.text)
genre_id.append(pre_id)
# 判定影片有码/无码
- subsite = pre_id.split('?')[0]
- movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite)
+ subsite = pre_id.split("?")[0]
+ movie.uncensored = {"uncensored": True, "tags": False}.get(subsite)
# JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
all_actors = actors_tag.xpath("a/text()")
genders = actors_tag.xpath("strong/text()")
- actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀']
- magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
+ actress = [i for i in all_actors if genders[all_actors.index(i)] == "♀"]
+ magnet = container.xpath(
+ "//div[@class='magnet-name column is-four-fifths']/a/@href"
+ )
movie.dvdid = dvdid
movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
+ movie.title = title.replace(dvdid, "").strip()
movie.cover = cover
movie.preview_pics = preview_pics
movie.publish_date = publish_date
@@ -197,7 +227,7 @@ def parse_data(movie: MovieInfo):
movie.genre = genre
movie.genre_id = genre_id
movie.actress = actress
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
+ movie.magnet = [i.replace("[javdb.com]", "") for i in magnet]
def parse_clean_data(movie: MovieInfo):
@@ -211,10 +241,12 @@ def parse_clean_data(movie: MovieInfo):
movie.cover = None
except SiteBlocked:
raise
- logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
- if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')):
+ logger.error("JavDB: 可能触发了反爬虫机制,请稍后再试")
+ if movie.genre_id and (not movie.genre_id[0].startswith("fc2?")):
movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
+ movie.genre_id = (
+ None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
+ )
def collect_actress_alias(type=0, use_original=True):
@@ -325,7 +357,7 @@ def collect_actress_alias(type=0, use_original=True):
if __name__ == "__main__":
# collect_actress_alias()
- movie = MovieInfo('FC2-2735981')
+ movie = MovieInfo("FC2-2735981")
try:
parse_clean_data(movie)
print(movie)
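
Note: the actress filter above looks genders up via all_actors.index(i), which returns the first occurrence and can mispair duplicate names. A positional zip avoids that edge case; this is an alternative formulation, not the diff's code:

    all_actors = ["A", "B", "A"]  # hypothetical duplicate name
    genders = ["♀", "♂", "♂"]
    actress = [name for name, g in zip(all_actors, genders) if g == "♀"]
    print(actress)  # -> ['A'] (only the first entry is female)
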
diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py
index 85f77b75f..f4b8c055a 100644
--- a/javsp/web/javlib.py
+++ b/javsp/web/javlib.py
@@ -1,4 +1,5 @@
"""从JavLibrary抓取数据"""
+
import logging
from urllib.parse import urlsplit
@@ -7,21 +8,21 @@
from javsp.web.exceptions import *
from javsp.web.proxyfree import get_proxy_free_url
from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo
+from javsp.datatype import MovieInfo
# 初始化Request实例
request = Request(use_scraper=True)
logger = logging.getLogger(__name__)
-permanent_url = 'https://www.javlibrary.com'
-base_url = ''
+permanent_url = "https://www.javlibrary.com"
+base_url = ""
def init_network_cfg():
"""设置合适的代理模式和base_url"""
request.timeout = 5
- proxy_free_url = get_proxy_free_url('javlib')
+ proxy_free_url = get_proxy_free_url("javlib")
urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url]
if proxy_free_url and proxy_free_url not in urls:
urls.insert(1, proxy_free_url)
@@ -39,7 +40,7 @@ def init_network_cfg():
return url
except Exception as e:
logger.debug(f"Fail to connect to '{url}': {e}")
- logger.warning('无法绕开JavLib的反爬机制')
+ logger.warning("无法绕开JavLib的反爬机制")
request.timeout = Cfg().network.timeout.seconds
return permanent_url
@@ -51,7 +52,7 @@ def parse_data(movie: MovieInfo):
if not base_url:
base_url = init_network_cfg()
logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}")
- url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
+ url = new_url = f"{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}"
resp = request.get(url)
html = resp2html(resp)
if resp.history:
@@ -61,10 +62,10 @@ def parse_data(movie: MovieInfo):
else:
# 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段,
# 为无效地址(应该是JavBus重定向配置有问题),需要使用新的base_url抓取数据
- base_url = 'https://' + urlsplit(resp.url).netloc
+ base_url = "https://" + urlsplit(resp.url).netloc
logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
return parse_data(movie)
- else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
+ else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
video_tags = html.xpath("//div[@class='video'][@id]/a")
# 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
pre_choose = []
@@ -72,7 +73,7 @@ def parse_data(movie: MovieInfo):
tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
if tag_dvdid.upper() == movie.dvdid.upper():
pre_choose.append(tag)
- pre_choose_urls = [i.get('href') for i in pre_choose]
+ pre_choose_urls = [i.get("href") for i in pre_choose]
match_count = len(pre_choose)
if match_count == 0:
raise MovieNotFoundError(__name__, movie.dvdid)
@@ -81,18 +82,24 @@ def parse_data(movie: MovieInfo):
elif match_count == 2:
no_blueray = []
for tag in pre_choose:
- if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc
+ if "ブルーレイディスク" not in tag.get("title"): # Blu-ray Disc
no_blueray.append(tag)
no_blueray_count = len(no_blueray)
if no_blueray_count == 1:
- new_url = no_blueray[0].get('href')
- logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
+ new_url = no_blueray[0].get("href")
+ logger.debug(
+ f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}"
+ )
else:
# 两个结果中没有谁是蓝光影片,说明影片番号重复了
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ raise MovieDuplicateError(
+ __name__, movie.dvdid, match_count, pre_choose_urls
+ )
else:
# 存在不同影片但是番号相同的情况,如MIDV-010
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ raise MovieDuplicateError(
+ __name__, movie.dvdid, match_count, pre_choose_urls
+ )
# 重新抓取网页
html = request.get_html(new_url)
container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
@@ -112,15 +119,15 @@ def parse_data(movie: MovieInfo):
movie.publisher = publisher_tag[0]
score_tag = info.xpath("//span[@class='score']/text()")
if score_tag:
- movie.score = score_tag[0].strip('()')
+ movie.score = score_tag[0].strip("()")
genre = info.xpath("//span[@class='genre']/a/text()")
actress = info.xpath("//span[@class='star']/a/text()")
movie.dvdid = dvdid
movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
- if cover.startswith('//'): # 补全URL中缺少的协议段
- cover = 'https:' + cover
+ movie.title = title.replace(dvdid, "").strip()
+ if cover.startswith("//"): # 补全URL中缺少的协议段
+ cover = "https:" + cover
movie.cover = cover
movie.publish_date = publish_date
movie.duration = duration
@@ -131,9 +138,10 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
base_url = permanent_url
- movie = MovieInfo('IPX-177')
+ movie = MovieInfo("IPX-177")
try:
parse_data(movie)
print(movie)
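
Note: the two-result branch above resolves same-ID duplicates by discarding the Blu-ray edition (ブルーレイディスク), whose cover has the wrong aspect ratio. Sketched with dicts standing in for the lxml tags and placeholder hrefs:

    pre_choose = [
        {"title": "IPX-177 ブルーレイディスク", "href": "/ja/?v=aaaa"},
        {"title": "IPX-177", "href": "/ja/?v=bbbb"},
    ]
    no_blueray = [t for t in pre_choose if "ブルーレイディスク" not in t["title"]]
    if len(no_blueray) == 1:
        new_url = no_blueray[0]["href"]
        print(new_url)  # -> /ja/?v=bbbb
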
diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py
index 5296a69cd..310b5de0f 100644
--- a/javsp/web/javmenu.py
+++ b/javsp/web/javmenu.py
@@ -1,4 +1,5 @@
"""从JavMenu抓取数据"""
+
import logging
from javsp.web.base import Request, resp2html
@@ -9,7 +10,7 @@
request = Request()
logger = logging.getLogger(__name__)
-base_url = 'https://mrzyx.xyz'
+base_url = "https://mrzyx.xyz"
def parse_data(movie: MovieInfo):
@@ -18,7 +19,7 @@ def parse_data(movie: MovieInfo):
movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
"""
# JavMenu网页做得很不走心,将就了
- url = f'{base_url}/{movie.dvdid}'
+ url = f"{base_url}/{movie.dvdid}"
r = request.get(url)
if r.history:
# 被重定向到主页说明找不到影片资源
@@ -28,13 +29,13 @@ def parse_data(movie: MovieInfo):
container = html.xpath("//div[@class='col-md-9 px-0']")[0]
title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0]
# 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站
- title = title.replace(' | JAV目錄大全 | 每日更新', '')
- title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '')
+ title = title.replace(" | JAV目錄大全 | 每日更新", "")
+ title = title.replace(" 免費在線看", "").replace(" 免費AV在線看", "")
cover_tag = container.xpath("//div[@class='single-video']")
if len(cover_tag) > 0:
- video_tag = cover_tag[0].find('video')
+ video_tag = cover_tag[0].find("video")
# URL首尾竟然也有空格……
- movie.cover = video_tag.get('data-poster').strip()
+ movie.cover = video_tag.get("data-poster").strip()
# 预览影片改为blob了,无法获取
# movie.preview_video = video_tag.find('source').get('src').strip()
else:
@@ -43,30 +44,39 @@ def parse_data(movie: MovieInfo):
movie.cover = cover_img_tag[0].strip()
info = container.xpath("//div[@class='card-body']")[0]
publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text
- duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '')
- producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()")
+ duration = (
+ info.xpath("div/span[contains(text(), '時長:')]")[0]
+ .getnext()
+ .text.replace("分鐘", "")
+ )
+ producer = info.xpath(
+ "div/span[contains(text(), '製作:')]/following-sibling::a/span/text()"
+ )
if producer:
movie.producer = producer[0]
genre_tags = info.xpath("//a[@class='genre']")
genre, genre_id = [], []
for tag in genre_tags:
- items = tag.get('href').split('/')
- pre_id = items[-3] + '/' + items[-1]
+ items = tag.get("href").split("/")
+ pre_id = items[-3] + "/" + items[-1]
genre.append(tag.text.strip())
genre_id.append(pre_id)
# genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠……
- actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None
+ actress = (
+ info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()")
+ or None
+ )
magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody")
if magnet_table:
magnet_links = magnet_table[0].xpath("tr/td/a/@href")
# 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links]
+ movie.magnet = [i.replace("[javdb.com]", "") for i in magnet_links]
preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href")
if (not movie.cover) and preview_pics:
movie.cover = preview_pics[0]
movie.url = url
- movie.title = title.replace(movie.dvdid, '').strip()
+ movie.title = title.replace(movie.dvdid, "").strip()
movie.preview_pics = preview_pics
movie.publish_date = publish_date
movie.duration = duration
@@ -77,10 +87,11 @@ def parse_data(movie: MovieInfo):
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('FC2-718323')
+ movie = MovieInfo("FC2-718323")
try:
parse_data(movie)
print(movie)
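
Note: the title handling above strips the site's injected ad strings before removing the ID itself. End to end, on a made-up title:

    dvdid = "FC2-718323"
    title = f"{dvdid} Sample Title | JAV目錄大全 | 每日更新 免費在線看"
    title = title.replace(" | JAV目錄大全 | 每日更新", "")
    title = title.replace(" 免費在線看", "").replace(" 免費AV在線看", "")
    print(title.replace(dvdid, "").strip())  # -> Sample Title
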
diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py
index 4904e51db..a3d1ac7e8 100644
--- a/javsp/web/mgstage.py
+++ b/javsp/web/mgstage.py
@@ -1,4 +1,5 @@
"""从蚊香社-mgstage抓取数据"""
+
import re
import logging
@@ -10,18 +11,18 @@
logger = logging.getLogger(__name__)
-base_url = 'https://www.mgstage.com'
+base_url = "https://www.mgstage.com"
# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
request = Request()
-request.cookies = {'adc': '1'}
+request.cookies = {"adc": "1"}
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
- url = f'{base_url}/product/product_detail/{movie.dvdid}/'
+ url = f"{base_url}/product/product_detail/{movie.dvdid}/"
resp = request.get(url, delay_raise=True)
if resp.status_code == 403:
- raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ raise SiteBlocked("mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理")
# url不存在时会被重定向至主页。history非空时说明发生了重定向
elif resp.history:
raise MovieNotFoundError(__name__, movie.dvdid)
@@ -33,18 +34,28 @@ def parse_data(movie: MovieInfo):
cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
# 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表
actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
- actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
+ actress_link = container.xpath(
+ "//th[text()='出演:']/following-sibling::td/a/text()"
+ )
actress = [i.strip() for i in actress_text + actress_link]
- actress = [i for i in actress if i] # 移除空字符串
- producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
- duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
- match = re.search(r'\d+', duration_str)
+ actress = [i for i in actress if i] # 移除空字符串
+ producer = container.xpath(
+ "//th[text()='メーカー:']/following-sibling::td/a/text()"
+ )[0].strip()
+ duration_str = container.xpath(
+ "//th[text()='収録時間:']/following-sibling::td/text()"
+ )[0]
+ match = re.search(r"\d+", duration_str)
if match:
movie.duration = match.group(0)
dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
- date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
- publish_date = date_str.replace('/', '-')
- serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")
+ date_str = container.xpath(
+ "//th[text()='配信開始日:']/following-sibling::td/text()"
+ )[0]
+ publish_date = date_str.replace("/", "-")
+ serial_tag = container.xpath(
+ "//th[text()='シリーズ:']/following-sibling::td/a/text()"
+ )
if serial_tag:
movie.serial = serial_tag[0].strip()
# label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
@@ -52,10 +63,10 @@ def parse_data(movie: MovieInfo):
genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
genre = [i.text.strip() for i in genre_tags]
score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
- match = re.search(r'^[\.\d]+', score_str)
+ match = re.search(r"^[\.\d]+", score_str)
if match:
score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
+ movie.score = f"{score:.2f}"
# plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签
plots = []
plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
@@ -66,26 +77,26 @@ def parse_data(movie: MovieInfo):
plots.append(p.text_content())
continue
for child in children:
- if child.tag == 'br' and plots[-1] != '\n':
- plots.append('\n')
+ if child.tag == "br" and plots[-1] != "\n":
+ plots.append("\n")
else:
if child.text:
plots.append(child.text)
if child.tail:
plots.append(child.tail)
- plot = ''.join(plots).strip()
+ plot = "".join(plots).strip()
preview_pics = container.xpath("//a[@class='sample_image']/@href")
if Cfg().crawler.hardworking:
# 预览视频是点击按钮后再加载的,不在静态网页中
btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
- video_pid = btn_url.split('/')[-1]
- req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
+ video_pid = btn_url.split("/")[-1]
+ req_url = f"{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}"
resp = request.get(req_url).json()
- video_url = resp.get('url')
+ video_url = resp.get("url")
if video_url:
# /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
- preview_video = video_url.split('.ism/')[0] + '.mp4'
+ preview_video = video_url.split(".ism/")[0] + ".mp4"
movie.preview_video = preview_video
movie.dvdid = dvdid
@@ -98,15 +109,16 @@ def parse_data(movie: MovieInfo):
movie.genre = genre
movie.plot = plot
movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('HRV-045')
+ movie = MovieInfo("HRV-045")
try:
parse_data(movie)
print(movie)
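
For the hardworking-mode branch above: the preview video is loaded on demand, so the crawler extracts a pid from the sample-button link, queries the sampleRespons.php endpoint, and rewrites the returned streaming URL into a direct .mp4. A sketch of that URL handling; btn_url and the response dict are hypothetical stand-ins, with the .ism sample path taken from the comment in the patch:

    # Sketch of the preview-video URL rewriting under hardworking mode.
    base_url = "https://www.mgstage.com"
    btn_url = f"{base_url}/sample_player/SIRO-3093"  # hypothetical sample-button link
    video_pid = btn_url.split("/")[-1]               # last path segment -> "SIRO-3093"
    req_url = f"{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}"

    resp = {"url": "/sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=X&pid=X"}
    video_url = resp.get("url")
    if video_url:
        # Cut the smooth-streaming suffix down to a direct .mp4 file.
        preview_video = video_url.split(".ism/")[0] + ".mp4"
        print(preview_video)  # -> /sample/shirouto/siro/3093/SIRO-3093_sample.mp4
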
diff --git a/javsp/web/njav.py b/javsp/web/njav.py
index f94e943f3..331e44e6a 100644
--- a/javsp/web/njav.py
+++ b/javsp/web/njav.py
@@ -1,4 +1,5 @@
"""从NJAV抓取数据"""
+
import re
import logging
from typing import List
@@ -11,12 +12,13 @@
logger = logging.getLogger(__name__)
-base_url = 'https://njav.tv/ja'
+base_url = "https://njav.tv/ja"
+
def search_video(movie: MovieInfo):
id_uc = movie.dvdid
# 抓取网页
- url = f'{base_url}/search?keyword={id_uc}'
+ url = f"{base_url}/search?keyword={id_uc}"
html = get_html(url)
list = html.xpath("//div[@class='box-item']/div[@class='detail']/a")
video_url = None
@@ -26,13 +28,14 @@ def search_video(movie: MovieInfo):
video_url = item.xpath("@href")
break
if id_uc.startswith("FC2-"):
- fc2id = id_uc.replace('FC2-', '')
+ fc2id = id_uc.replace("FC2-", "")
if "FC2" in search_title and fc2id in search_title:
video_url = item.xpath("@href")
break
-
+
return get_list_first(video_url)
-
+
+
def parse_data(movie: MovieInfo):
"""解析指定番号的影片数据"""
# 抓取网页
@@ -45,8 +48,10 @@ def parse_data(movie: MovieInfo):
container = container[0]
else:
raise MovieNotFoundError(__name__, movie.dvdid)
-
- title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0]
+
+ title = container.xpath(
+ "//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()"
+ )[0]
thumb_pic = container.xpath("//div[@id='player']/@data-poster")
plot = " ".join(container.xpath("//div[@class='description']/p/text()"))
magnet = container.xpath("//div[@class='magnet']/a/@href")
@@ -64,13 +69,13 @@ def parse_data(movie: MovieInfo):
detail_dic = {}
for item in container.xpath("//div[@class='detail-item']/div"):
- item_title = item.xpath('span/text()')[0]
+ item_title = item.xpath("span/text()")[0]
if "タグ:" in item_title:
genre += item.xpath("span")[1].xpath("a/text()")
elif "ジャンル:" in item_title:
genre += item.xpath("span")[1].xpath("a/text()")
elif "レーベル:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
+ genre += item.xpath("span")[1].xpath("a/text()")
elif "女優:" in item_title:
actress = item.xpath("span")[1].xpath("a/text()")
elif "シリーズ:" in item_title:
@@ -83,18 +88,18 @@ def parse_data(movie: MovieInfo):
publish_date = get_list_first(item.xpath("span")[1].xpath("text()"))
elif "再生時間:" in item_title:
duration_str = get_list_first(item.xpath("span")[1].xpath("text()"))
-
+
# 清除标题里的番号字符
keywords = [real_id, " "]
if movie.dvdid.startswith("FC2"):
- keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]]
+ keywords += ["FC2", "PPV", "-"] + [movie.dvdid.split("-")[-1]]
for keyword in keywords:
- title = re.sub(re.escape(keyword), "", title, flags=re.I)
+ title = re.sub(re.escape(keyword), "", title, flags=re.I)
# 判断是否无码
uncensored_arr = magnet + [title]
for uncensored_str in uncensored_arr:
- if 'uncensored' in uncensored_str.lower():
+ if "uncensored" in uncensored_str.lower():
uncensored = True
movie.url = url
@@ -118,15 +123,18 @@ def parse_data(movie: MovieInfo):
else:
movie.cover = get_list_first(thumb_pic)
-def get_list_first(list:List):
+
+def get_list_first(list: List):
return list[0] if list and len(list) > 0 else None
+
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('012023_002')
+ movie = MovieInfo("012023_002")
try:
parse_data(movie)
print(movie)
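
The keyword-scrubbing loop above removes every trace of the movie ID from the scraped title, case-insensitively; note that the space in the keyword list also collapses whitespace, which suits Japanese titles that contain no inter-word spaces. A worked example with a made-up FC2 title (keyword order is illustrative):

    # Worked example of the title-scrubbing loop; the title is fabricated.
    import re

    title = "FC2-PPV-1234567 fc2 ppv あの素人タイトル 1234567"
    keywords = ["1234567", " ", "FC2", "PPV", "-"]
    for keyword in keywords:
        title = re.sub(re.escape(keyword), "", title, flags=re.I)
    print(title)  # -> "あの素人タイトル"
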
diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py
index f6884c658..65e0eee0b 100644
--- a/javsp/web/prestige.py
+++ b/javsp/web/prestige.py
@@ -1,4 +1,5 @@
"""从蚊香社-prestige抓取数据"""
+
import re
import logging
@@ -9,10 +10,10 @@
logger = logging.getLogger(__name__)
-base_url = 'https://www.prestige-av.com'
+base_url = "https://www.prestige-av.com"
# prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面
# (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取)
-cookies = {'__age_auth__': 'true'}
+cookies = {"__age_auth__": "true"}
def parse_data(movie: MovieInfo):
@@ -20,13 +21,15 @@ def parse_data(movie: MovieInfo):
Args:
movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
"""
- url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}'
+ url = f"{base_url}/goods/goods_detail.php?sku={movie.dvdid}"
resp = request_get(url, cookies=cookies, delay_raise=True)
if resp.status_code == 500:
# 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
raise MovieNotFoundError(__name__, movie.dvdid)
elif resp.status_code == 403:
- raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ raise SiteBlocked(
+ "prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理"
+ )
resp.raise_for_status()
html = resp2html(resp)
container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']")
@@ -35,25 +38,41 @@ def parse_data(movie: MovieInfo):
container = container_tags[0]
title = container.xpath("h1/span")[0].tail.strip()
- cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0]
- cover = cover.split('?')[0]
- actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()")
+ cover = container.xpath(
+ "//div[@class='c-ratio-image mr-8']/picture/source/img/@src"
+ )[0]
+ cover = cover.split("?")[0]
+ actress = container.xpath(
+ "//p[text()='出演者:']/following-sibling::div/p/a/text()"
+ )
# 移除女优名中的空格,使女优名与其他网站保持一致
- actress = [i.strip().replace(' ', '') for i in actress]
- duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
- match = re.search(r'\d+', duration_str)
+ actress = [i.strip().replace(" ", "") for i in actress]
+ duration_str = (
+ container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
+ )
+ match = re.search(r"\d+", duration_str)
if match:
movie.duration = match.group(0)
- date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0]
- publish_date = date_url.split('?date=')[-1]
- producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip()
+ date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[
+ 0
+ ]
+ publish_date = date_url.split("?date=")[-1]
+ producer = container.xpath(
+ "//p[text()='メーカー:']/following-sibling::div/a/text()"
+ )[0].strip()
dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0]
genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a")
genre = [tag.text.strip() for tag in genre_tags]
- serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip()
- plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip()
- preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src")
- preview_pics = [i.split('?')[0] for i in preview_pics]
+ serial = container.xpath(
+ "//p[text()='レーベル:']/following-sibling::div/a/text()"
+ )[0].strip()
+ plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[
+ 0
+ ].text.strip()
+ preview_pics = container.xpath(
+ "//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src"
+ )
+ preview_pics = [i.split("?")[0] for i in preview_pics]
# prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效
movie.url = url
@@ -67,15 +86,18 @@ def parse_data(movie: MovieInfo):
movie.serial = serial
movie.plot = plot
movie.preview_pics = preview_pics
- movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
+ movie.uncensored = (
+ False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
+ )
if __name__ == "__main__":
import pretty_errors
+
pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('ABP-647')
+ movie = MovieInfo("ABP-647")
try:
parse_data(movie)
print(movie)
diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py
index 89c1e63a4..2c98b7cef 100644
--- a/javsp/web/proxyfree.py
+++ b/javsp/web/proxyfree.py
@@ -1,4 +1,5 @@
"""获取各个网站的免代理地址"""
+
import re
import sys
@@ -17,15 +18,15 @@ def get_proxy_free_url(site_name: str, prefer_url=None) -> str:
return prefer_url
# 当prefer_url不可用时,尝试自动获取指定网站的免代理地址
site_name = site_name.lower()
- func_name = f'_get_{site_name}_urls'
- get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')]
+ func_name = f"_get_{site_name}_urls"
+ get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith("_get_")]
if func_name in get_funcs:
get_urls = getattr(sys.modules[__name__], func_name)
try:
urls = get_urls()
return _choose_one(urls)
except:
- return ''
+ return ""
else:
raise Exception("Dont't know how to get proxy-free url for " + site_name)
@@ -34,42 +35,52 @@ def _choose_one(urls) -> str:
for url in urls:
if is_connectable(url, timeout=5):
return url
- return ''
+ return ""
def _get_avsox_urls() -> list:
- html = get_html('https://tellme.pw/avsox')
- urls = html.xpath('//h4/strong/a/@href')
+ html = get_html("https://tellme.pw/avsox")
+ urls = html.xpath("//h4/strong/a/@href")
return urls
def _get_javbus_urls() -> list:
- html = get_html('https://www.javbus.one/')
+ html = get_html("https://www.javbus.one/")
text = html.text_content()
- urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
+ urls = re.findall(
+ r"防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})",
+ text,
+ re.I | re.A,
+ )
return urls
def _get_javlib_urls() -> list:
- html = get_html('https://github.com/javlibcom')
- text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
- match = re.search(r'[\w\.]+', text, re.A)
+ html = get_html("https://github.com/javlibcom")
+ text = html.xpath(
+ "//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']"
+ )[0].text_content()
+ match = re.search(r"[\w\.]+", text, re.A)
if match:
- domain = f'https://www.{match.group(0)}.com'
+ domain = f"https://www.{match.group(0)}.com"
return [domain]
def _get_javdb_urls() -> list:
- html = get_html('https://jav524.app')
+ html = get_html("https://jav524.app")
js_links = html.xpath("//script[@src]/@src")
for link in js_links:
- if '/js/index' in link:
+ if "/js/index" in link:
text = get_resp_text(request_get(link))
- match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
+ match = re.search(
+ r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"',
+ text,
+ flags=re.I | re.A,
+ )
if match:
return [match.group(1)]
if __name__ == "__main__":
- print('javdb:\t', _get_javdb_urls())
- print('javlib:\t', _get_javlib_urls())
+ print("javdb:\t", _get_javdb_urls())
+ print("javlib:\t", _get_javlib_urls())
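
get_proxy_free_url above resolves a helper named `_get_<site>_urls` off the module at runtime, so supporting a new site only requires defining one more function. A minimal sketch of that dispatch with toy providers (URLs are placeholders):

    # Name-based dispatch as in proxyfree.py: look up _get_<site>_urls
    # on the current module. The two providers are toy stand-ins.
    import sys

    def _get_avsox_urls() -> list:
        return ["https://avsox.example"]

    def _get_javdb_urls() -> list:
        return ["https://javdb.example"]

    def get_urls_for(site_name: str) -> list:
        func_name = f"_get_{site_name.lower()}_urls"
        getter = getattr(sys.modules[__name__], func_name, None)
        if getter is None:
            raise Exception("Don't know how to get proxy-free url for " + site_name)
        return getter()

    print(get_urls_for("avsox"))  # -> ['https://avsox.example']
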
diff --git a/javsp/web/translate.py b/javsp/web/translate.py
index 2e762cb15..1ef736287 100644
--- a/javsp/web/translate.py
+++ b/javsp/web/translate.py
@@ -1,23 +1,30 @@
"""网页翻译接口"""
+
# 由于翻译服务不走代理,而且需要自己的错误处理机制,因此不通过base.py来管理网络请求
+import logging
+import random
import time
-from typing import Union
import uuid
-import random
-import logging
-from pydantic_core import Url
-import requests
from hashlib import md5
+from typing import Union
+import requests
+from pydantic_core import Url
-__all__ = ['translate', 'translate_movie_info']
+__all__ = ["translate", "translate_movie_info"]
-from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine
+from javsp.config import (
+ BaiduTranslateEngine,
+ BingTranslateEngine,
+ Cfg,
+ ClaudeTranslateEngine,
+ GoogleTranslateEngine,
+ OpenAITranslateEngine,
+)
from javsp.datatype import MovieInfo
from javsp.web.base import read_proxy
-
logger = logging.getLogger(__name__)
@@ -26,36 +33,42 @@ def translate_movie_info(info: MovieInfo):
# 翻译标题
if info.title and Cfg().translator.fields.title and info.ori_title is None:
result = translate(info.title, Cfg().translator.engine, info.actress)
- if 'trans' in result:
+ if "trans" in result:
info.ori_title = info.title
- info.title = result['trans']
+ info.title = result["trans"]
# 如果有的话,附加断句信息
- if 'orig_break' in result:
- setattr(info, 'ori_title_break', result['orig_break'])
- if 'trans_break' in result:
- setattr(info, 'title_break', result['trans_break'])
+ if "orig_break" in result:
+ setattr(info, "ori_title_break", result["orig_break"])
+ if "trans_break" in result:
+ setattr(info, "title_break", result["trans_break"])
else:
- logger.error('翻译标题时出错: ' + result['error'])
+ logger.error("翻译标题时出错: " + result["error"])
return False
# 翻译简介
if info.plot and Cfg().translator.fields.plot:
result = translate(info.plot, Cfg().translator.engine, info.actress)
- if 'trans' in result:
+ if "trans" in result:
# 只有翻译过plot的影片才可能需要ori_plot属性,因此在运行时动态添加,而不添加到类型定义里
- setattr(info, 'ori_plot', info.plot)
- info.plot = result['trans']
+ setattr(info, "ori_plot", info.plot)
+ info.plot = result["trans"]
else:
- logger.error('翻译简介时出错: ' + result['error'])
+ logger.error("翻译简介时出错: " + result["error"])
return False
return True
-def translate(texts, engine: Union[
+
+def translate(
+ texts,
+ engine: Union[
BaiduTranslateEngine,
BingTranslateEngine,
ClaudeTranslateEngine,
OpenAITranslateEngine,
- None
- ], actress=[]):
+ GoogleTranslateEngine,
+ None,
+ ],
+ actress=[],
+):
"""
翻译入口:对错误进行处理并且统一返回格式
@@ -65,84 +78,108 @@ def translate(texts, engine: Union[
翻译出错: {'error': 'baidu: 54000: PARAM_FROM_TO_OR_Q_EMPTY'}
"""
rtn = {}
- err_msg = ''
- if engine.name == 'baidu':
+ err_msg = ""
+ if engine.name == "baidu":
result = baidu_translate(texts, engine.app_id, engine.api_key)
- if 'error_code' not in result:
+ if "error_code" not in result:
# 百度翻译的结果中的组表示的是按换行符分隔的不同段落,而不是句子
- paragraphs = [i['dst'] for i in result['trans_result']]
- rtn = {'trans': '\n'.join(paragraphs)}
+ paragraphs = [i["dst"] for i in result["trans_result"]]
+ rtn = {"trans": "\n".join(paragraphs)}
else:
- err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg'])
- elif engine.name == 'bing':
+ err_msg = "{}: {}: {}".format(
+ engine, result["error_code"], result["error_msg"]
+ )
+ elif engine.name == "bing":
# 使用动态词典保护原文中的女优名,防止翻译后认不出来
for i in actress:
-            texts = texts.replace(i, f'<mstrans:dictionary translation="{i}">{i}</mstrans:dictionary>')
+            texts = texts.replace(
+                i, f'<mstrans:dictionary translation="{i}">{i}</mstrans:dictionary>'
+            )
result = bing_translate(texts, api_key=engine.api_key)
- if 'error' not in result:
- sentLen = result[0]['translations'][0]['sentLen']
+ if "error" not in result:
+ sentLen = result[0]["translations"][0]["sentLen"]
orig_break, trans_break = [], []
# 对原文进行断句
remaining = texts
- for i in sentLen['srcSentLen']:
+ for i in sentLen["srcSentLen"]:
orig_break.append(remaining[:i])
remaining = remaining[i:]
# 对译文进行断句
- remaining = result[0]['translations'][0]['text']
- for i in sentLen['transSentLen']:
+ remaining = result[0]["translations"][0]["text"]
+ for i in sentLen["transSentLen"]:
# Bing会在译文的每个句尾添加一个空格,这并不符合中文的标点习惯,所以去掉这个空格
- trans_break.append(remaining[:i].rstrip(' '))
+ trans_break.append(remaining[:i].rstrip(" "))
remaining = remaining[i:]
- trans = ''.join(trans_break)
- rtn = {'trans': trans, 'orig_break': orig_break, 'trans_break': trans_break}
+ trans = "".join(trans_break)
+ rtn = {"trans": trans, "orig_break": orig_break, "trans_break": trans_break}
else:
- err_msg = "{}: {}: {}".format(engine, result['error']['code'], result['error']['message'])
- elif engine.name == 'claude':
+ err_msg = "{}: {}: {}".format(
+ engine, result["error"]["code"], result["error"]["message"]
+ )
+ elif engine.name == "claude":
try:
result = claude_translate(texts, engine.api_key)
- if 'error_code' not in result:
- rtn = {'trans': result}
+ if "error_code" not in result:
+ rtn = {"trans": result}
else:
- err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg'])
+ err_msg = "{}: {}: {}".format(
+ engine, result["error_code"], result["error_msg"]
+ )
except Exception as e:
err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e))
- elif engine.name == 'openai':
+ elif engine.name == "openai":
try:
result = openai_translate(texts, engine.url, engine.api_key, engine.model)
- if 'error_code' not in result:
- rtn = {'trans': result}
+ if "error_code" not in result:
+ rtn = {"trans": result}
else:
- err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg'])
+ err_msg = "{}: {}: {}".format(
+ engine, result["error_code"], result["error_msg"]
+ )
except Exception as e:
err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e))
- elif engine.name == 'google':
+ elif engine.name == "google":
try:
result = google_trans(texts)
# 经测试,翻译成功时会带有'sentences'字段;失败时不带,也没有故障码
- if 'sentences' in result:
+ if "sentences" in result:
# Google会对句子分组,完整的译文需要自行拼接
- orig_break = [i['orig'] for i in result['sentences']]
- trans_break = [i['trans'] for i in result['sentences']]
- trans = ''.join(trans_break)
- rtn = {'trans': trans, 'orig_break': orig_break, 'trans_break': trans_break}
+ orig_break = [i["orig"] for i in result["sentences"]]
+ trans_break = [i["trans"] for i in result["sentences"]]
+ trans = "".join(trans_break)
+ rtn = {
+ "trans": trans,
+ "orig_break": orig_break,
+ "trans_break": trans_break,
+ }
else:
- err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg'])
+ err_msg = "{}: {}: {}".format(
+ engine, result["error_code"], result["error_msg"]
+ )
except Exception as e:
err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e))
else:
- return {'trans': texts}
+ return {"trans": texts}
-def baidu_translate(texts, app_id, api_key, to='zh'):
+
+def baidu_translate(texts, app_id, api_key, to="zh"):
"""使用百度翻译文本(默认翻译为简体中文)"""
api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate"
- headers = {'Content-Type': 'application/x-www-form-urlencoded'}
+ headers = {"Content-Type": "application/x-www-form-urlencoded"}
salt = random.randint(0, 0x7FFFFFFF)
sign_input = app_id + texts + str(salt) + api_key
- sign = md5(sign_input.encode('utf-8')).hexdigest()
- payload = {'appid': app_id, 'q': texts, 'from': 'auto', 'to': to, 'salt': salt, 'sign': sign}
+ sign = md5(sign_input.encode("utf-8")).hexdigest()
+ payload = {
+ "appid": app_id,
+ "q": texts,
+ "from": "auto",
+ "to": to,
+ "salt": salt,
+ "sign": sign,
+ }
# 由于百度标准版限制QPS为1,连续翻译标题和简介会超限,因此需要添加延时
now = time.perf_counter()
- last_access = getattr(baidu_translate, '_last_access', -1)
+ last_access = getattr(baidu_translate, "_last_access", -1)
wait = 1.0 - (now - last_access)
if wait > 0:
time.sleep(wait)
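
The delay logic above implements a 1-QPS throttle by stashing the previous call time as an attribute on the function object itself, avoiding a module-level global; the attribute is presumably refreshed after each request (that assignment sits outside the hunk shown). A standalone sketch of the pattern:

    # Function-attribute throttle, as in baidu_translate: at least one
    # second must elapse between consecutive calls.
    import time

    def throttled_call():
        now = time.perf_counter()
        last_access = getattr(throttled_call, "_last_access", -1)
        wait = 1.0 - (now - last_access)
        if wait > 0:
            time.sleep(wait)  # enforce the QPS budget
        throttled_call._last_access = time.perf_counter()

    for _ in range(3):
        throttled_call()  # the second and third calls are delayed
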
@@ -152,24 +189,26 @@ def baidu_translate(texts, app_id, api_key, to='zh'):
return result
-def bing_translate(texts, api_key, to='zh-Hans'):
+def bing_translate(texts, api_key, to="zh-Hans"):
"""使用Bing翻译文本(默认翻译为简体中文)"""
api_url = "https://api.cognitive.microsofttranslator.com/translate"
- params = {'api-version': '3.0', 'to': to, 'includeSentenceLength': True}
+ params = {"api-version": "3.0", "to": to, "includeSentenceLength": True}
headers = {
- 'Ocp-Apim-Subscription-Key': api_key,
- 'Ocp-Apim-Subscription-Region': 'global',
- 'Content-type': 'application/json',
- 'X-ClientTraceId': str(uuid.uuid4())
+ "Ocp-Apim-Subscription-Key": api_key,
+ "Ocp-Apim-Subscription-Region": "global",
+ "Content-type": "application/json",
+ "X-ClientTraceId": str(uuid.uuid4()),
}
- body = [{'text': texts}]
+ body = [{"text": texts}]
r = requests.post(api_url, params=params, headers=headers, json=body)
result = r.json()
return result
_google_trans_wait = 60
-def google_trans(texts, to='zh_CN'):
+
+
+def google_trans(texts, to="zh_CN"):
"""使用Google翻译文本(默认翻译为简体中文)"""
# API: https://www.jianshu.com/p/ce35d89c25c3
# client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017
@@ -178,7 +217,9 @@ def google_trans(texts, to='zh_CN'):
proxies = read_proxy()
r = requests.get(url, proxies=proxies)
while r.status_code == 429:
- logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试")
+ logger.warning(
+ f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试"
+ )
time.sleep(_google_trans_wait)
r = requests.get(url, proxies=proxies)
if r.status_code == 429:
@@ -186,10 +227,11 @@ def google_trans(texts, to='zh_CN'):
if r.status_code == 200:
result = r.json()
else:
- result = {'error_code': r.status_code, 'error_msg': r.reason}
- time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间
+ result = {"error_code": r.status_code, "error_msg": r.reason}
+ time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间
return result
+
def claude_translate(texts, api_key, to="zh_CN"):
"""使用Claude翻译文本(默认翻译为简体中文)"""
api_url = "https://api.anthropic.com/v1/messages"
@@ -214,6 +256,7 @@ def claude_translate(texts, api_key, to="zh_CN"):
}
return result
+
def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
"""使用 OpenAI 翻译文本(默认翻译为简体中文)"""
api_url = str(url)
@@ -222,29 +265,32 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
"Authorization": f"Bearer {api_key}",
}
data = {
- "messages": [
- {
- "role": "system",
- "content": f"Translate the following Japanese paragraph into {to}, while leaving non-Japanese text, names, or text that does not look like Japanese untranslated. Reply with the translated text only, do not add any text that is not in the original content."
- },
- {
- "role": "user",
- "content": texts
- }
- ],
- "model": model,
- "temperature": 0,
- "max_tokens": 1024,
+ "messages": [
+ {
+ "role": "system",
+ "content": f"Translate the following Japanese paragraph into {to}, while leaving non-Japanese text, names, or text that does not look like Japanese untranslated. Reply with the translated text only, do not add any text that is not in the original content.",
+ },
+ {"role": "user", "content": texts},
+ ],
+ "model": model,
+ "temperature": 0,
+ "max_tokens": 1024,
}
r = requests.post(api_url, headers=headers, json=data)
if r.status_code == 200:
- if 'error' in r.json():
+ if "error" in r.json():
result = {
"error_code": r.status_code,
"error_msg": r.json().get("error", {}).get("message", ""),
}
else:
- result = r.json().get("choices", [{}])[0].get("message", {}).get("content", "").strip()
+ result = (
+ r.json()
+ .get("choices", [{}])[0]
+ .get("message", {})
+ .get("content", "")
+ .strip()
+ )
else:
result = {
"error_code": r.status_code,
diff --git a/poetry.lock b/poetry.lock
index 1c92293a3..14bb35742 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -549,27 +549,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "flake8"
-version = "7.1.1"
-description = "the modular source code checker: pep8 pyflakes and co"
-optional = false
-python-versions = ">=3.8.1"
-files = [
- {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"},
- {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"},
-]
-
-[package.dependencies]
-mccabe = ">=0.7.0,<0.8.0"
-pycodestyle = ">=2.12.0,<2.13.0"
-pyflakes = ">=3.2.0,<3.3.0"
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "idna"
version = "3.10"
@@ -889,22 +868,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "mccabe"
-version = "0.7.0"
-description = "McCabe checker, plugin for flake8"
-optional = false
-python-versions = ">=3.6"
-files = [
- {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
- {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "packaging"
version = "24.1"
@@ -1176,22 +1139,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "pycodestyle"
-version = "2.12.1"
-description = "Python style guide checker"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"},
- {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "pycparser"
version = "2.22"
@@ -1412,22 +1359,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "pyflakes"
-version = "3.2.0"
-description = "passive checker of Python programs"
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"},
- {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "pyparsing"
version = "3.1.4"
@@ -1668,6 +1599,38 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "ruff"
+version = "0.6.8"
+description = "An extremely fast Python linter and code formatter, written in Rust."
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "ruff-0.6.8-py3-none-linux_armv6l.whl", hash = "sha256:77944bca110ff0a43b768f05a529fecd0706aac7bcce36d7f1eeb4cbfca5f0f2"},
+ {file = "ruff-0.6.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:27b87e1801e786cd6ede4ada3faa5e254ce774de835e6723fd94551464c56b8c"},
+ {file = "ruff-0.6.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:cd48f945da2a6334f1793d7f701725a76ba93bf3d73c36f6b21fb04d5338dcf5"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:677e03c00f37c66cea033274295a983c7c546edea5043d0c798833adf4cf4c6f"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9f1476236b3eacfacfc0f66aa9e6cd39f2a624cb73ea99189556015f27c0bdeb"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5a2f17c7d32991169195d52a04c95b256378bbf0de8cb98478351eb70d526f"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5fd0d4b7b1457c49e435ee1e437900ced9b35cb8dc5178921dfb7d98d65a08d0"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8034b19b993e9601f2ddf2c517451e17a6ab5cdb1c13fdff50c1442a7171d87"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6cfb227b932ba8ef6e56c9f875d987973cd5e35bc5d05f5abf045af78ad8e098"},
+ {file = "ruff-0.6.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef0411eccfc3909269fed47c61ffebdcb84a04504bafa6b6df9b85c27e813b0"},
+ {file = "ruff-0.6.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:007dee844738c3d2e6c24ab5bc7d43c99ba3e1943bd2d95d598582e9c1b27750"},
+ {file = "ruff-0.6.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ce60058d3cdd8490e5e5471ef086b3f1e90ab872b548814e35930e21d848c9ce"},
+ {file = "ruff-0.6.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1085c455d1b3fdb8021ad534379c60353b81ba079712bce7a900e834859182fa"},
+ {file = "ruff-0.6.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:70edf6a93b19481affd287d696d9e311388d808671bc209fb8907b46a8c3af44"},
+ {file = "ruff-0.6.8-py3-none-win32.whl", hash = "sha256:792213f7be25316f9b46b854df80a77e0da87ec66691e8f012f887b4a671ab5a"},
+ {file = "ruff-0.6.8-py3-none-win_amd64.whl", hash = "sha256:ec0517dc0f37cad14a5319ba7bba6e7e339d03fbf967a6d69b0907d61be7a263"},
+ {file = "ruff-0.6.8-py3-none-win_arm64.whl", hash = "sha256:8d3bb2e3fbb9875172119021a13eed38849e762499e3cfde9588e4b4d70968dc"},
+ {file = "ruff-0.6.8.tar.gz", hash = "sha256:a5bf44b1aa0adaf6d9d20f86162b34f7c593bfedabc51239953e446aefc8ce18"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "setuptools"
version = "75.1.0"
@@ -2041,4 +2004,4 @@ reference = "mirrors"
[metadata]
lock-version = "2.0"
python-versions = "<3.13,>=3.10"
-content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c"
+content-hash = "29f8d207debd76155da3db3331fa117832c969b27889645d3a65570c8692f47d"
diff --git a/pyproject.toml b/pyproject.toml
index a5e1b4d10..152f1a289 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ confz = "^2.0.1"
pydantic-extra-types = "^2.9.0"
pendulum = "^3.0.0"
slimeface = "^2024.9.27"
+ruff = "^0.6.8"
[tool.poetry.scripts]
javsp = "javsp.__main__:entry"
@@ -42,7 +43,6 @@ priority = "primary"
[tool.poetry.group.dev.dependencies]
pytest = "^8.1.1"
-flake8 = "^7.0.0"
cx-freeze = "^7.2.2"
types-lxml = "^2024.4.14"
types-pillow = "^10.2.0.20240822"
diff --git a/setup.py b/setup.py
index 5d3aba2a8..81652452d 100644
--- a/setup.py
+++ b/setup.py
@@ -9,39 +9,34 @@
include_files: List[Tuple[str, str]] = [
- (f'{proj_root}/config.yml', 'config.yml'),
- (f'{proj_root}/data', 'data'),
- (f'{proj_root}/image', 'image')
+ (f"{proj_root}/config.yml", "config.yml"),
+ (f"{proj_root}/data", "data"),
+ (f"{proj_root}/image", "image"),
]
includes = []
-for file in os.listdir('javsp/web'):
+for file in os.listdir("javsp/web"):
name, ext = os.path.splitext(file)
- if ext == '.py':
- includes.append('javsp.web.' + name)
+ if ext == ".py":
+ includes.append("javsp.web." + name)
-packages = [
- 'pendulum' # pydantic_extra_types depends on pendulum
+packages = [
+ "pendulum" # pydantic_extra_types depends on pendulum
]
build_exe = {
- 'include_files': include_files,
- 'includes': includes,
- 'excludes': ['unittest'],
- 'packages': packages,
+ "include_files": include_files,
+ "includes": includes,
+ "excludes": ["unittest"],
+ "packages": packages,
}
javsp = Executable(
- './javsp/__main__.py',
- target_name='JavSP',
+ "./javsp/__main__.py",
+ target_name="JavSP",
base=base,
- icon='./image/JavSP.ico',
-)
-
-setup(
- name='JavSP',
- options = {'build_exe': build_exe},
- executables=[javsp]
+ icon="./image/JavSP.ico",
)
+setup(name="JavSP", options={"build_exe": build_exe}, executables=[javsp])
diff --git a/tools/airav_search.py b/tools/airav_search.py
index ca6aa95d8..678b1ab48 100644
--- a/tools/airav_search.py
+++ b/tools/airav_search.py
@@ -1,37 +1,38 @@
"""获取airav指定关键词的所有搜索结果"""
+
import os
import sys
import json
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from javsp.web.base import Request
request = Request()
-request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
+request.headers["Accept-Language"] = "zh-TW,zh;q=0.9"
-base_url = 'https://www.airav.wiki'
+base_url = "https://www.airav.wiki"
def search(keyword):
"""搜索指定影片的所有结果"""
all_results = []
page = 1
- data = {'offset': 0, 'count': 1, 'result': []}
- while (data['offset'] + len(data['result']) < data['count']):
- url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={keyword}&page={page}'
+ data = {"offset": 0, "count": 1, "result": []}
+ while data["offset"] + len(data["result"]) < data["count"]:
+ url = f"{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={keyword}&page={page}"
data = request.get(url).json()
- all_results.extend(data['result'])
+ all_results.extend(data["result"])
print(f"Get page {page}: {len(data['result'])} movie(s)")
page += 1
for i in all_results:
- if not i['url']:
- i['url'] = f"{base_url}/video/{i['barcode']}"
+ if not i["url"]:
+ i["url"] = f"{base_url}/video/{i['barcode']}"
return all_results
if __name__ == "__main__":
- keyword = '版'
+ keyword = "版"
results = search(keyword)
- with open(f'airav_search_{keyword}.json', 'wt', encoding='utf-8') as f:
+ with open(f"airav_search_{keyword}.json", "wt", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
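
The search loop above keeps requesting pages until offset plus the size of the current page reaches the total reported by the API. A self-contained sketch with a fake endpoint standing in for the real one:

    # Pagination as in search(): the primed data dict guarantees the
    # loop body runs at least once. fake_api simulates the endpoint.
    def fake_api(page: int) -> dict:
        items = [f"movie-{i}" for i in range(7)]  # 7 results, 3 per page
        start = (page - 1) * 3
        return {"offset": start, "count": 7, "result": items[start:start + 3]}

    all_results, page = [], 1
    data = {"offset": 0, "count": 1, "result": []}
    while data["offset"] + len(data["result"]) < data["count"]:
        data = fake_api(page)
        all_results.extend(data["result"])
        page += 1

    assert len(all_results) == 7
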
diff --git a/tools/call_crawler.py b/tools/call_crawler.py
index ed17b4ba2..da4315a19 100644
--- a/tools/call_crawler.py
+++ b/tools/call_crawler.py
@@ -1,4 +1,5 @@
"""调用抓取器抓取数据"""
+
import os
import sys
@@ -11,21 +12,21 @@
file_dir = os.path.dirname(__file__)
-data_dir = os.path.abspath(os.path.join(file_dir, '../unittest/data'))
-sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..')))
+data_dir = os.path.abspath(os.path.join(file_dir, "../unittest/data"))
+sys.path.insert(0, os.path.abspath(os.path.join(file_dir, "..")))
from javsp.datatype import MovieInfo
# 搜索抓取器并导入它们
all_crawler = {}
-exclude_files = ['fc2fan']
-for file in os.listdir('web'):
+exclude_files = ["fc2fan"]
+for file in os.listdir("web"):
name, ext = os.path.splitext(file)
- if ext == '.py' and name not in exclude_files:
- modu = 'web.' + name
+ if ext == ".py" and name not in exclude_files:
+ modu = "web." + name
__import__(modu)
- if hasattr(sys.modules[modu], 'parse_data'):
- parser = getattr(sys.modules[modu], 'parse_data')
+ if hasattr(sys.modules[modu], "parse_data"):
+ parser = getattr(sys.modules[modu], "parse_data")
all_crawler[name] = parser
@@ -38,18 +39,18 @@ def call_crawlers(dvdid_list: list, used_crawlers=None):
crawlers (list[str], optional): 要使用的抓取器,未指定时将使用全部抓取器
"""
if used_crawlers:
- crawlers = {i:all_crawler[i] for i in used_crawlers}
+ crawlers = {i: all_crawler[i] for i in used_crawlers}
else:
crawlers = all_crawler
- outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False)
+ outer_bar = tqdm(dvdid_list, desc="抓取影片数据", leave=False)
for avid in outer_bar:
success, fail = [], []
- outer_bar.set_description(f'抓取影片数据: {avid}')
- inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False)
+ outer_bar.set_description(f"抓取影片数据: {avid}")
+ inner_bar = tqdm(crawlers.items(), desc="抓取器", leave=False)
for name, parser in inner_bar:
- inner_bar.set_description(f'正在抓取{name}'.rjust(10+len(avid)))
+ inner_bar.set_description(f"正在抓取{name}".rjust(10 + len(avid)))
# 每次都会创建一个全新的实例,所以不同抓取器的结果之间不会有影响
- if name != 'fanza':
+ if name != "fanza":
movie = MovieInfo(avid)
else:
movie = MovieInfo(cid=avid)
@@ -60,7 +61,9 @@ def call_crawlers(dvdid_list: list, used_crawlers=None):
success.append(name)
except:
fail.append(name)
- out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(avid, len(success), ' '.join(success), len(fail), ' '.join(fail))
+ out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(
+ avid, len(success), " ".join(success), len(fail), " ".join(fail)
+ )
tqdm.write(out)
@@ -69,16 +72,16 @@ def call_crawlers(dvdid_list: list, used_crawlers=None):
# 带参数调用时,将参数全部视作番号并调用所有抓取器抓取数据
call_crawlers(sys.argv[1:])
else:
- user_in = input('请输入要抓取数据的影片番号: ')
+ user_in = input("请输入要抓取数据的影片番号: ")
dvdid_list = user_in.split()
# 提示选择要使用的抓取器
names = list(all_crawler.keys())
for i in range(len(names)):
- print(f"{i+1}. {names[i]}", end=' ')
- user_in2 = input('\n请选择要使用的抓取器(回车表示全部使用): ')
+ print(f"{i+1}. {names[i]}", end=" ")
+ user_in2 = input("\n请选择要使用的抓取器(回车表示全部使用): ")
if user_in2:
items = user_in2.split()
- indexes = [int(i)-1 for i in items if i.isdigit()]
+ indexes = [int(i) - 1 for i in items if i.isdigit()]
valid_indexes = [i for i in indexes if i < len(names)]
used = [names[i] for i in valid_indexes]
else:
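
The discovery loop at the top of this tool imports each crawler module by name and keeps it only if it exposes parse_data. A tiny sketch of that `__import__` plus sys.modules pattern, using the standard-library json module as a stand-in:

    # Import a module by dotted name, then fetch an attribute off it,
    # as call_crawler.py does with parse_data. json is a stand-in.
    import sys

    modu = "json"  # the real code builds "web.<crawler>" names
    __import__(modu)
    mod = sys.modules[modu]
    if hasattr(mod, "loads"):  # the real check is for parse_data
        parser = getattr(mod, "loads")
        print(parser('{"ok": true}'))  # -> {'ok': True}
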
diff --git a/tools/check_genre.py b/tools/check_genre.py
index dd562dc65..f8357aa79 100644
--- a/tools/check_genre.py
+++ b/tools/check_genre.py
@@ -16,21 +16,25 @@
import csv
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from javsp.web.base import *
from javsp.config import cfg
def get_javbus_genre():
"""获取JavBus的genre各语言对照列表"""
- record = {} # {id: [cn_url, zh_tw, ja, en]}
+ record = {} # {id: [cn_url, zh_tw, ja, en]}
base_url = cfg.ProxyFree.javbus
subsite_urls = {
- 'normal': ['/genre', '/ja/genre', '/en/genre'],
- 'uncensored': ['/uncensored/genre', '/ja/uncensored/genre', '/en/uncensored/genre'],
+ "normal": ["/genre", "/ja/genre", "/en/genre"],
+ "uncensored": [
+ "/uncensored/genre",
+ "/ja/uncensored/genre",
+ "/en/uncensored/genre",
+ ],
}
for subsite, urls in subsite_urls.items():
- id_prefix = 'uncensored-' if subsite == 'uncensored' else ''
+ id_prefix = "uncensored-" if subsite == "uncensored" else ""
zh_tw = get_html(base_url + urls[0])
ja = get_html(base_url + urls[1])
en = get_html(base_url + urls[2])
@@ -38,8 +42,8 @@ def get_javbus_genre():
genre_tags = html.xpath("//div[@class='row genre-box']/a")
# 提取各个genre的信息
for tag in genre_tags:
- url = tag.get('href')
- id = id_prefix + url.split('/')[-1]
+ url = tag.get("href")
+ id = id_prefix + url.split("/")[-1]
name = tag.text.strip()
if id in record:
record[id].append(name)
@@ -47,9 +51,9 @@ def get_javbus_genre():
record[id] = [url, name]
# 将相关数据进行结构化后返回
data = {
- 'site_name': 'javbus',
- 'header': ['id', 'url', 'zh_tw', 'ja', 'en'],
- 'record': record
+ "site_name": "javbus",
+ "header": ["id", "url", "zh_tw", "ja", "en"],
+ "record": record,
}
return data
@@ -61,9 +65,9 @@ def get_javdb_genre():
record = {}
base_url = cfg.ProxyFree.javdb
subsite_urls = {
- 'normal': ['/tags?locale=zh', '/tags?locale=en'],
- 'uncensored': ['/tags/uncensored?locale=zh', '/tags/uncensored?locale=en'],
- 'western': ['/tags/western?locale=zh', '/tags/western?locale=en']
+ "normal": ["/tags?locale=zh", "/tags?locale=en"],
+ "uncensored": ["/tags/uncensored?locale=zh", "/tags/uncensored?locale=en"],
+ "western": ["/tags/western?locale=zh", "/tags/western?locale=en"],
}
for subsite, urls in subsite_urls.items():
zh_tw = get_html(base_url + urls[0])
@@ -72,8 +76,8 @@ def get_javdb_genre():
genre_tags = html.xpath("//span[@class='tag_labels']/a")
# 提取各个genre的信息
for tag in genre_tags:
- url = tag.get('href')
- id = url.split('/')[-1]
+ url = tag.get("href")
+ id = url.split("/")[-1]
name = tag.text.strip()
if id in record:
record[id].append(name)
@@ -81,14 +85,14 @@ def get_javdb_genre():
record[id] = [url, name]
# 移除分类中的c9:'筛选', c10:'年份', c11:'时长'
for id, _ in record.copy().items():
- catelog = id.split('?')[1].split('=')[0] # e.g. tags?c11=2021
- if catelog in ['c9', 'c10', 'c11']:
+ catelog = id.split("?")[1].split("=")[0] # e.g. tags?c11=2021
+ if catelog in ["c9", "c10", "c11"]:
del record[id]
# 将相关数据进行结构化后返回
data = {
- 'site_name': 'javdb',
- 'header': ['id', 'url', 'zh_tw', 'en'],
- 'record': record
+ "site_name": "javdb",
+ "header": ["id", "url", "zh_tw", "en"],
+ "record": record,
}
return data
@@ -97,22 +101,22 @@ def get_avsox_genre():
"""获取AVSOX的genre各语言对照列表"""
record = {}
base_url = cfg.ProxyFree.avsox
- languages = ['cn', 'tw', 'en', 'ja']
+ languages = ["cn", "tw", "en", "ja"]
for lang in languages:
- html = get_html(f'{base_url}/{lang}/genre')
+ html = get_html(f"{base_url}/{lang}/genre")
genre_tags = html.xpath("//div[@class='row genre-box']/a")
for tag in genre_tags:
- url = tag.get('href')
- id = url.split('/')[-1]
+ url = tag.get("href")
+ id = url.split("/")[-1]
name = tag.text.strip()
if id in record:
record[id].append(name)
else:
record[id] = [url, name]
data = {
- 'site_name': 'avsox',
- 'header': ['id', 'url', 'zh_cn', 'zh_tw', 'en', 'ja'],
- 'record': record
+ "site_name": "avsox",
+ "header": ["id", "url", "zh_cn", "zh_tw", "en", "ja"],
+ "record": record,
}
return data
@@ -121,22 +125,22 @@ def get_javlib_genre():
"""获取JavLibrary的genre各语言对照列表"""
record = {}
base_url = cfg.ProxyFree.javlib
- languages = ['cn', 'tw', 'en', 'ja']
+ languages = ["cn", "tw", "en", "ja"]
for lang in languages:
- html = get_html(f'{base_url}/{lang}/genres.php')
+ html = get_html(f"{base_url}/{lang}/genres.php")
genre_tags = html.xpath("//div[@class='genreitem']/a")
for tag in genre_tags:
- url = tag.get('href')
- id = url.split('=')[-1]
+ url = tag.get("href")
+ id = url.split("=")[-1]
name = tag.text.strip()
if id in record:
record[id].append(name)
else:
record[id] = [url, name]
data = {
- 'site_name': 'javlib',
- 'header': ['id', 'url', 'zh_cn', 'zh_tw', 'en', 'ja'],
- 'record': record
+ "site_name": "javlib",
+ "header": ["id", "url", "zh_cn", "zh_tw", "en", "ja"],
+ "record": record,
}
return data
@@ -144,12 +148,12 @@ def get_javlib_genre():
def write_csv(data):
"""将genre按照中文翻译排序后写入csv文件"""
# data格式: {'site_name': name, 'header': ['id', 'url', 'zh_tw'...], 'record': {id1: [ls1], id2: [ls2]...}}
- record = data['record']
+ record = data["record"]
csv_name = f"data/genre_{data['site_name']}.csv"
- csv_header = data['header'] + ['translate', 'note']
+ csv_header = data["header"] + ["translate", "note"]
# p[1][1] 必须是最接近最终翻译文本的那一列(如繁体中文)
sort_record = {k: v for k, v in sorted(record.items(), key=lambda p: p[1][1])}
- with open(csv_name, 'wt', encoding='utf-8-sig', newline='') as csvfile:
+ with open(csv_name, "wt", encoding="utf-8-sig", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(csv_header)
for id, genres in sort_record.items():
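
Each language pass in the functions above appends its localized name to the per-genre record, so after all passes a row holds [url, name1, name2, ...] in the order the languages were fetched, matching the CSV header. A compact sketch with made-up ids and names:

    # How the per-genre record accumulates across language passes.
    record = {}
    passes = [("zh_tw", {"g1": "戶外"}), ("en", {"g1": "Outdoor"}), ("ja", {"g1": "野外"})]
    for lang, names in passes:
        for gid, name in names.items():
            if gid in record:
                record[gid].append(name)
            else:
                record[gid] = [f"/genre/{gid}", name]
    print(record)  # -> {'g1': ['/genre/g1', '戶外', 'Outdoor', '野外']}
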
diff --git a/tools/config_migration.py b/tools/config_migration.py
index 95adc45d6..d1360d388 100644
--- a/tools/config_migration.py
+++ b/tools/config_migration.py
@@ -3,55 +3,68 @@
import re
arg_parser = ArgumentParser(
- prog='config migration',
- description='migration your javsp config to yaml')
+    prog="config migration", description="migrate your JavSP config to yaml"
+)
-arg_parser.add_argument('-i', '--input', help='path to config.ini')
-arg_parser.add_argument('-o', '--output', help='path to output config', default="config.yml")
+arg_parser.add_argument("-i", "--input", help="path to config.ini")
+arg_parser.add_argument(
+ "-o", "--output", help="path to output config", default="config.yml"
+)
args, _ = arg_parser.parse_known_args()
-if(args.input is None):
+if args.input is None:
print("Expecting an input config file, try `config_migration.py -h` to see help.")
exit(1)
cfg = ConfigParser()
cfg.read(args.input)
-ignore_regexes: list[str] = cfg['MovieID']['ignore_regex'].split(';')
-ignore_regexes += cfg['MovieID']['ignore_whole_word'].split(';')
-ignore_regexes.append('(144|240|360|480|720|1080)[Pp]')
-ignore_regexes.append('[24][Kk]')
+ignore_regexes: list[str] = cfg["MovieID"]["ignore_regex"].split(";")
+ignore_regexes += cfg["MovieID"]["ignore_whole_word"].split(";")
+ignore_regexes.append("(144|240|360|480|720|1080)[Pp]")
+ignore_regexes.append("[24][Kk]")
-input_directory = cfg['File']['scan_dir']
-input_directory = 'null' if len(input_directory) == 0 else f"'{input_directory}'"
+input_directory = cfg["File"]["scan_dir"]
+input_directory = "null" if len(input_directory) == 0 else f"'{input_directory}'"
-filename_extensions = cfg['File']['media_ext'].split(';')
+filename_extensions = cfg["File"]["media_ext"].split(";")
-ignored_folders = cfg['File']['ignore_folder'].split(';')
+ignored_folders = cfg["File"]["ignore_folder"].split(";")
+
+proxy_disabled = cfg["Network"]["use_proxy"] == "no" or cfg["Network"]["proxy"] == ""
-proxy_disabled = cfg['Network']['use_proxy'] == 'no' or cfg['Network']['proxy'] == ''
def yes_to_true(s):
- return 'true' if s == 'yes' else 'false'
+ return "true" if s == "yes" else "false"
+
def use_javdb_cover(s):
- if s == 'yes': return 'no'
- elif s == 'no': return 'yes'
- elif s == 'auto': return 'fallback'
+ if s == "yes":
+ return "no"
+ elif s == "no":
+ return "yes"
+ elif s == "auto":
+ return "fallback"
+
def path_len_by_byte(s):
- if s == 'no': return 'false'
- else: return 'true'
+ if s == "no":
+ return "false"
+ else:
+ return "true"
+
def ai_crop_pat(s):
- if s == r'\d':
- return r'^\d{6}[-_]\d{3}$'
+ if s == r"\d":
+ return r"^\d{6}[-_]\d{3}$"
else:
- return '^' + s
+ return "^" + s
+
def fix_pat(p):
- return re.sub(r'\$([a-z]+)', r'{\1}', p)
+ return re.sub(r"\$([a-z]+)", r"{\1}", p)
+
config_str = f"""# vim:foldmethod=marker
################################
@@ -242,6 +255,5 @@ def fix_pat(p):
# 是否允许检查到新版本时自动下载
auto_update: {yes_to_true(cfg['Other']['auto_update'])}"""
-with open(args.output, mode ="w") as file:
+with open(args.output, mode="w") as file:
file.write(config_str)
-
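
fix_pat above converts the old ini-style `$placeholder` tokens into the new `{placeholder}` format-string syntax in one regex substitution. A quick demonstration (the naming pattern is a made-up sample):

    # $placeholder -> {placeholder}, as done by fix_pat.
    import re

    def fix_pat(p):
        return re.sub(r"\$([a-z]+)", r"{\1}", p)

    print(fix_pat("$actress/[$num] $title"))  # -> {actress}/[{num}] {title}
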
diff --git a/tools/version.py b/tools/version.py
index b018e0a16..9e63bfaba 100644
--- a/tools/version.py
+++ b/tools/version.py
@@ -1,5 +1,5 @@
import importlib.metadata as meta
-javsp_version = meta.version('javsp')
+javsp_version = meta.version("javsp")
print(javsp_version)
diff --git a/unittest/conftest.py b/unittest/conftest.py
index bfd973a8b..f46945597 100644
--- a/unittest/conftest.py
+++ b/unittest/conftest.py
@@ -4,7 +4,7 @@
from glob import glob
-data_dir = os.path.join(os.path.dirname(__file__), 'data')
+data_dir = os.path.join(os.path.dirname(__file__), "data")
def pytest_addoption(parser):
@@ -12,6 +12,7 @@ def pytest_addoption(parser):
"--only", action="store", default="", help="仅测试指定抓取器的数据"
)
+
def pytest_runtest_logreport(report):
"""定制 short test summary info 显示格式"""
# report 的部分属性形如
@@ -20,8 +21,8 @@ def pytest_runtest_logreport(report):
# keywords: {'082713-417: avsox': 1, 'unittest/test_crawlers.py': 1, 'test_crawler[082713-417: avsox]': 1, 'JavSP': 1}
# 为test_crawlers.py定制short test summary格式
- if 'test_crawlers.py::' in report.nodeid:
- report.nodeid = re.sub(r'^.*::test_crawler', '', report.nodeid)
+ if "test_crawlers.py::" in report.nodeid:
+ report.nodeid = re.sub(r"^.*::test_crawler", "", report.nodeid)
@pytest.fixture
@@ -30,17 +31,17 @@ def crawler(request):
def pytest_generate_tests(metafunc):
- if 'crawler_params' in metafunc.fixturenames:
+ if "crawler_params" in metafunc.fixturenames:
# 根据测试数据文件夹中的文件生成测试数据
testcases = {}
- data_files = glob(data_dir + os.sep + '*.json')
+ data_files = glob(data_dir + os.sep + "*.json")
target_crawler = metafunc.config.getoption("--only")
for file in data_files:
basename = os.path.basename(file)
match = re.match(r"([-\w]+) \((\w+)\)", basename, re.I)
if match:
avid, scraper = match.groups()
- name = f'{avid}: {scraper}'
+ name = f"{avid}: {scraper}"
# 仅当未指定抓取器或者指定的抓取器与当前抓取器相同时,才实际执行抓取和比较
if (not target_crawler) or scraper == target_crawler:
testcases[name] = (avid, scraper, file)
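
pytest_generate_tests above turns every `<AVID> (<scraper>).json` data file into its own test case; the metafunc.parametrize call itself falls outside the hunk shown, so the sketch below supplies a plausible one. File names here are made up:

    # Data-driven parametrization in the style of conftest.py.
    import re

    FAKE_FILES = ["082713-417 (avsox).json", "ABP-647 (prestige).json"]

    def pytest_generate_tests(metafunc):
        if "crawler_params" in metafunc.fixturenames:
            testcases = {}
            for basename in FAKE_FILES:
                match = re.match(r"([-\w]+) \((\w+)\)", basename, re.I)
                if match:
                    avid, scraper = match.groups()
                    testcases[f"{avid}: {scraper}"] = (avid, scraper, basename)
            metafunc.parametrize(
                "crawler_params", list(testcases.values()), ids=list(testcases.keys())
            )

    def test_crawler(crawler_params):
        avid, scraper, file = crawler_params
        assert avid and scraper and file
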
diff --git a/unittest/test_avid.py b/unittest/test_avid.py
index ca0c0008f..9ee5006b3 100644
--- a/unittest/test_avid.py
+++ b/unittest/test_avid.py
@@ -5,7 +5,7 @@
from shutil import rmtree
file_dir = os.path.dirname(__file__)
-sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(file_dir, "..")))
from javsp.avid import get_id, get_cid
@@ -16,13 +16,13 @@ def prepare_files(files):
Args:
files (list of tuple): 文件列表,仅接受相对路径
"""
- tmp_folder = 'tmp_' + uuid.uuid4().hex[:8]
+ tmp_folder = "tmp_" + uuid.uuid4().hex[:8]
for i in files:
path = os.path.join(tmp_folder, i)
folder = os.path.split(path)[0]
if folder and (not os.path.exists(folder)):
os.makedirs(folder)
- with open(path, 'wt', encoding='utf-8') as f:
+ with open(path, "wt", encoding="utf-8") as f:
f.write(path)
yield
rmtree(tmp_folder)
@@ -30,31 +30,31 @@ def prepare_files(files):
def test_fc2():
- assert 'FC2-123456' == get_id('(2017) [FC2-123456] 【個人撮影】')
- assert 'FC2-123456' == get_id('fc2-ppv-123456-1.delogo.mp4')
- assert 'FC2-123456' == get_id('FC2-PPV-123456.mp4')
- assert 'FC2-123456' == get_id('FC2PPV-123456 Yuukiy')
- assert 'FC2-1234567' == get_id('fc2-ppv_1234567-2.mp4')
+ assert "FC2-123456" == get_id("(2017) [FC2-123456] 【個人撮影】")
+ assert "FC2-123456" == get_id("fc2-ppv-123456-1.delogo.mp4")
+ assert "FC2-123456" == get_id("FC2-PPV-123456.mp4")
+ assert "FC2-123456" == get_id("FC2PPV-123456 Yuukiy")
+ assert "FC2-1234567" == get_id("fc2-ppv_1234567-2.mp4")
def test_normal():
- assert '' == get_id('Yuukiy')
- assert 'ABC-12' == get_id('ABC-12_01.mkv')
- assert 'ABC-123' == get_id('Sky Angel Vol.6 月丘うさぎ(ABC-123).avi')
- assert 'ABCD-123' == get_id('ABCD-123.mp4')
+ assert "" == get_id("Yuukiy")
+ assert "ABC-12" == get_id("ABC-12_01.mkv")
+ assert "ABC-123" == get_id("Sky Angel Vol.6 月丘うさぎ(ABC-123).avi")
+ assert "ABCD-123" == get_id("ABCD-123.mp4")
def test_cid_valid():
- assert 'ab012st' == get_cid('ab012st')
- assert 'ab012st' == get_cid('ab012st.mp4')
- assert '123_0456' == get_cid('123_0456.mp4')
- assert '123abc00045' == get_cid('123abc00045.mp4')
- assert '403abcd56789' == get_cid('403abcd56789_1')
- assert 'h_001abc00001' == get_cid('h_001abc00001.mp4')
- assert '1234wvr00001rp' == get_cid('1234wvr00001rp.mp4')
- assert '402abc_hello000089' == get_cid('402abc_hello000089.mp4')
- assert 'h_826zizd021' == get_cid('h_826zizd021.mp4')
- assert '403abcd56789' == get_cid('403abcd56789cd1.mp4')
+ assert "ab012st" == get_cid("ab012st")
+ assert "ab012st" == get_cid("ab012st.mp4")
+ assert "123_0456" == get_cid("123_0456.mp4")
+ assert "123abc00045" == get_cid("123abc00045.mp4")
+ assert "403abcd56789" == get_cid("403abcd56789_1")
+ assert "h_001abc00001" == get_cid("h_001abc00001.mp4")
+ assert "1234wvr00001rp" == get_cid("1234wvr00001rp.mp4")
+ assert "402abc_hello000089" == get_cid("402abc_hello000089.mp4")
+ assert "h_826zizd021" == get_cid("h_826zizd021.mp4")
+ assert "403abcd56789" == get_cid("403abcd56789cd1.mp4")
def test_from_file():
@@ -62,50 +62,52 @@ def test_from_file():
write_back = False
rewrite_lines = []
- datafile = os.path.join(file_dir, 'testdata_avid.txt')
- with open(datafile, 'rt', encoding='utf-8') as f:
+ datafile = os.path.join(file_dir, "testdata_avid.txt")
+ with open(datafile, "rt", encoding="utf-8") as f:
lines = f.readlines()
for line_no, line in enumerate(lines, start=1):
- items = line.strip('\r\n').split('\t')
+ items = line.strip("\r\n").split("\t")
if len(items) == 2:
(filename, avid), ignore = items, False
else:
filename, avid, ignore = items
guess_id = get_id(filename)
if write_back:
- rewrite_lines.append(f'{filename}\t{guess_id}\n')
+ rewrite_lines.append(f"{filename}\t{guess_id}\n")
continue
if guess_id != avid:
if ignore:
print(f"Ignored: {guess_id} != {avid}\t'{filename}'")
else:
- assert guess_id == avid.upper(), f'AV ID not match at line {line_no}'
+ assert (
+ guess_id == avid.upper()
+ ), f"AV ID not match at line {line_no}"
if write_back:
- with open(datafile, 'wt', encoding='utf-8') as f:
+ with open(datafile, "wt", encoding="utf-8") as f:
f.writelines(rewrite_lines)
def test_cid_invalid():
- assert '' == get_cid('hasUpperletter.mp4')
- assert '' == get_cid('存在非ASCII字符.mp4')
- assert '' == get_cid('has-dash.mp4')
- assert '' == get_cid('403_abcd56789_fgh')
- assert '' == get_cid('many_parts1234-12.mp4')
- assert '' == get_cid('abc12.mp4')
- assert '' == get_cid('ab012st/仅文件夹名称为cid.mp4')
- assert '' == get_cid('123_0456st.mp4')
+ assert "" == get_cid("hasUpperletter.mp4")
+ assert "" == get_cid("存在非ASCII字符.mp4")
+ assert "" == get_cid("has-dash.mp4")
+ assert "" == get_cid("403_abcd56789_fgh")
+ assert "" == get_cid("many_parts1234-12.mp4")
+ assert "" == get_cid("abc12.mp4")
+ assert "" == get_cid("ab012st/仅文件夹名称为cid.mp4")
+ assert "" == get_cid("123_0456st.mp4")
-@pytest.mark.parametrize('files', [('Unknown.mp4',)])
+@pytest.mark.parametrize("files", [("Unknown.mp4",)])
def test_by_folder_name1(prepare_files):
- assert '' == get_id('Unknown.mp4')
+ assert "" == get_id("Unknown.mp4")
-@pytest.mark.parametrize('files', [('FC2-123456/Unknown.mp4',)])
+@pytest.mark.parametrize("files", [("FC2-123456/Unknown.mp4",)])
def test_by_folder_name2(prepare_files):
- assert 'FC2-123456' == get_id('FC2-123456/Unknown.mp4')
+ assert "FC2-123456" == get_id("FC2-123456/Unknown.mp4")
-@pytest.mark.parametrize('files', [('ABC-123/CDF-456.mp4',)])
+@pytest.mark.parametrize("files", [("ABC-123/CDF-456.mp4",)])
def test_by_folder_name3(prepare_files):
- assert 'CDF-456' == get_id('ABC-123/CDF-456.mp4')
+ assert "CDF-456" == get_id("ABC-123/CDF-456.mp4")
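
prepare_files above is a yield fixture: everything before the yield is setup, everything after runs as teardown even when the test body fails. A minimal standalone version of the pattern (paths are made up):

    # Yield-fixture setup/teardown, as used by prepare_files.
    import os
    import pytest
    from shutil import rmtree

    @pytest.fixture
    def scratch_dir():
        folder = "tmp_demo"
        os.makedirs(folder, exist_ok=True)  # setup
        yield folder                        # the test runs here
        rmtree(folder)                      # teardown

    def test_uses_scratch(scratch_dir):
        assert os.path.isdir(scratch_dir)
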
diff --git a/unittest/test_crawlers.py b/unittest/test_crawlers.py
index 3b0257e07..42f5cf4c8 100644
--- a/unittest/test_crawlers.py
+++ b/unittest/test_crawlers.py
@@ -6,8 +6,8 @@
file_dir = os.path.dirname(__file__)
-data_dir = os.path.join(file_dir, 'data')
-sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..')))
+data_dir = os.path.join(file_dir, "data")
+sys.path.insert(0, os.path.abspath(os.path.join(file_dir, "..")))
from javsp.datatype import MovieInfo
from javsp.web.exceptions import CrawlerError, SiteBlocked
@@ -25,27 +25,30 @@ def test_crawler(crawler_params):
compare(*crawler_params)
except requests.exceptions.ReadTimeout:
logger.warning(f"{site} 连接超时: {params}")
- except Exception as e:
- if os.getenv('GITHUB_ACTIONS') and (site in ['javdb', 'javlib', 'airav']):
- logger.debug(f'检测到Github actions环境,已忽略测试失败项: {params}', exc_info=True)
+ except Exception:
+ if os.getenv("GITHUB_ACTIONS") and (site in ["javdb", "javlib", "airav"]):
+ logger.debug(
+ f"检测到Github actions环境,已忽略测试失败项: {params}", exc_info=True
+ )
else:
raise
+
def compare(avid, scraper, file):
"""从本地的数据文件生成Movie实例,并与在线抓取到的数据进行比较"""
local = MovieInfo(from_file=file)
- if scraper != 'fanza':
+ if scraper != "fanza":
online = MovieInfo(avid)
else:
online = MovieInfo(cid=avid)
# 导入抓取器模块
- scraper_mod = 'javsp.web.' + scraper
+ scraper_mod = "javsp.web." + scraper
__import__(scraper_mod)
mod = sys.modules[scraper_mod]
- if hasattr(mod, 'parse_clean_data'):
- parse_data = getattr(mod, 'parse_clean_data')
+ if hasattr(mod, "parse_clean_data"):
+ parse_data = getattr(mod, "parse_clean_data")
else:
- parse_data = getattr(mod, 'parse_data')
+ parse_data = getattr(mod, "parse_data")
try:
parse_data(online)
@@ -61,22 +64,24 @@ def compare(avid, scraper, file):
online_vars = vars(online)
for k, v in online_vars.items():
# 部分字段可能随时间变化,因此只要这些字段不是一方有值一方无值就行
- if k in ['score', 'magnet']:
+ if k in ["score", "magnet"]:
assert bool(v) == bool(local_vars.get(k, None))
- elif k == 'preview_video' and scraper in ['airav', 'javdb']:
+ elif k == "preview_video" and scraper in ["airav", "javdb"]:
assert bool(v) == bool(local_vars.get(k, None))
# JavBus采用免代理域名时图片地址也会是免代理域名,因此只比较path部分即可
- elif k == 'cover' and scraper == 'javbus':
+ elif k == "cover" and scraper == "javbus":
assert urlsplit(v).path == urlsplit(local_vars.get(k, None)).path
- elif k == 'actress_pics' and scraper == 'javbus':
+ elif k == "actress_pics" and scraper == "javbus":
local_tmp = online_tmp = {}
local_pics = local_vars.get(k)
if local_pics:
- local_tmp = {name: urlsplit(url).path for name, url in local_pics.items()}
+ local_tmp = {
+ name: urlsplit(url).path for name, url in local_pics.items()
+ }
if v:
online_tmp = {name: urlsplit(url).path for name, url in v.items()}
assert local_tmp == online_tmp
- elif k == 'preview_pics' and scraper == 'javbus':
+ elif k == "preview_pics" and scraper == "javbus":
local_pics = local_vars.get(k)
if local_pics:
local_tmp = [urlsplit(i).path for i in local_pics]
@@ -84,7 +89,7 @@ def compare(avid, scraper, file):
online_tmp = [urlsplit(i).path for i in v]
assert local_tmp == online_tmp
        # For list fields with no ordering requirement, the comparison should ignore order as well
- elif k in ['genre', 'genre_id', 'genre_norm', 'actress']:
+ elif k in ["genre", "genre_id", "genre_norm", "actress"]:
if isinstance(v, list):
loc_v = local_vars.get(k)
if loc_v is None:
@@ -96,7 +101,7 @@ def compare(avid, scraper, file):
assert v == local_vars.get(k, None)
except AssertionError:
        # When running locally, update the stored test data so differences can be reviewed via version control
- if not os.getenv('GITHUB_ACTIONS'):
+ if not os.getenv("GITHUB_ACTIONS"):
online.dump(file)
raise
except Exception as e:
diff --git a/unittest/test_exe.py b/unittest/test_exe.py
index 983d1ff67..3307d5f0c 100644
--- a/unittest/test_exe.py
+++ b/unittest/test_exe.py
@@ -8,25 +8,29 @@
def test_javsp_exe():
cwd = os.getcwd()
- dist_dir = os.path.normpath(os.path.join(os.path.dirname(__file__) + '/../dist'))
+ dist_dir = os.path.normpath(os.path.join(os.path.dirname(__file__) + "/../dist"))
os.chdir(dist_dir)
size = 300 * 2**20
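+    # 300 MiB: presumably large enough to clear the minimum-size filter that
+    # weeds out junk video files during scanning.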
- tmp_folder = '.TMP_' + ''.join(random.choices(string.ascii_uppercase, k=6))
- FILE = '300MAAN-642.RIP.f4v'
+ tmp_folder = ".TMP_" + "".join(random.choices(string.ascii_uppercase, k=6))
+ FILE = "300MAAN-642.RIP.f4v"
try:
os.system(f"fsutil file createnew {FILE} {size}")
- r = subprocess.run(f"JavSP.exe --auto-exit --input . --output {tmp_folder}".split(), capture_output=True, encoding='utf-8')
- print(r.stdout, r.stderr.encode().decode("unicode_escape"), sep='\n')
+ r = subprocess.run(
+ f"JavSP.exe --auto-exit --input . --output {tmp_folder}".split(),
+ capture_output=True,
+ encoding="utf-8",
+ )
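+        # Un-escape \uXXXX sequences so any non-ASCII text in stderr prints readably.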
+ print(r.stdout, r.stderr.encode().decode("unicode_escape"), sep="\n")
r.check_returncode()
# Check generated files
- files = glob(tmp_folder + '/**/*.*', recursive=True)
- print('\n'.join(files))
+ files = glob(tmp_folder + "/**/*.*", recursive=True)
+ print("\n".join(files))
# assert all('横宮七海' in i for i in files), "Actress name not found"
- assert any(i.endswith('fanart.jpg') for i in files), "fanart not found"
- assert any(i.endswith('poster.jpg') for i in files), "poster not found"
- assert any(i.endswith('.f4v') for i in files), "video file not found"
- assert any(i.endswith('.nfo') for i in files), "nfo file not found"
+ assert any(i.endswith("fanart.jpg") for i in files), "fanart not found"
+ assert any(i.endswith("poster.jpg") for i in files), "poster not found"
+ assert any(i.endswith(".f4v") for i in files), "video file not found"
+ assert any(i.endswith(".nfo") for i in files), "nfo file not found"
finally:
if os.path.exists(FILE):
os.remove(FILE)
diff --git a/unittest/test_file.py b/unittest/test_file.py
index df83467e0..ae3a689da 100644
--- a/unittest/test_file.py
+++ b/unittest/test_file.py
@@ -6,17 +6,19 @@
from shutil import rmtree
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from javsp.file import scan_movies
-tmp_folder = 'TMP_' + ''.join(random.choices(string.ascii_uppercase, k=6))
-DEFAULT_SIZE = 512*2**20 # 512 MiB
+tmp_folder = "TMP_" + "".join(random.choices(string.ascii_uppercase, k=6))
+DEFAULT_SIZE = 512 * 2**20 # 512 MiB
+
def touch_file_size(path: str, size_bytes: int):
- with open(path, 'wb') as f:
+ with open(path, "wb") as f:
f.seek(size_bytes - 1)
- f.write(b'\0')
+ f.write(b"\0")
+
@pytest.fixture
def prepare_files(files):
@@ -26,7 +28,7 @@ def prepare_files(files):
        files (list of tuple): list of files; only relative paths are accepted
"""
if not isinstance(files, dict):
- files = {i:DEFAULT_SIZE for i in files}
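+        # A bare tuple/list of names is normalized to {name: DEFAULT_SIZE}.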
+ files = {i: DEFAULT_SIZE for i in files}
for name, size in files.items():
path = os.path.join(tmp_folder, name)
folder = os.path.split(path)[0]
@@ -39,190 +41,234 @@ def prepare_files(files):
# A single movie file directly in the root folder
-@pytest.mark.parametrize('files', [('ABC-123.mp4',)])
+@pytest.mark.parametrize("files", [("ABC-123.mp4",)])
def test_single_movie(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 1
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123.mp4'
+ assert basenames[0] == "ABC-123.mp4"
# Multiple parts ordered numerically: 0, 1, 2
-@pytest.mark.parametrize('files', [('ABC-123-0.mp4','ABC-123-1.mp4','ABC-123- 2.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123-0.mp4", "ABC-123-1.mp4", "ABC-123- 2.mp4")]
+)
def test_scan_movies__012(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123-0.mp4'
- assert basenames[1] == 'ABC-123-1.mp4'
- assert basenames[2] == 'ABC-123- 2.mp4'
+ assert basenames[0] == "ABC-123-0.mp4"
+ assert basenames[1] == "ABC-123-1.mp4"
+ assert basenames[2] == "ABC-123- 2.mp4"
# Multiple parts ordered numerically: 1, 2, 3
-@pytest.mark.parametrize('files', [('ABC-123.1.mp4','ABC-123. 2.mp4','ABC-123.3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.1.mp4", "ABC-123. 2.mp4", "ABC-123.3.mp4")]
+)
def test_scan_movies__123(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123.1.mp4'
- assert basenames[1] == 'ABC-123. 2.mp4'
- assert basenames[2] == 'ABC-123.3.mp4'
+ assert basenames[0] == "ABC-123.1.mp4"
+ assert basenames[1] == "ABC-123. 2.mp4"
+ assert basenames[2] == "ABC-123.3.mp4"
# Multiple parts ordered alphabetically
-@pytest.mark.parametrize('files', [('ABC-123-A.mp4','ABC-123-B.mp4','ABC-123- C .mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123-A.mp4", "ABC-123-B.mp4", "ABC-123- C .mp4")]
+)
def test_scan_movies__abc(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123-A.mp4'
- assert basenames[1] == 'ABC-123-B.mp4'
- assert basenames[2] == 'ABC-123- C .mp4'
+ assert basenames[0] == "ABC-123-A.mp4"
+ assert basenames[1] == "ABC-123-B.mp4"
+ assert basenames[2] == "ABC-123- C .mp4"
# Multiple parts numbered with a .CDx suffix
-@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','ABC-123.CD2 .mp4','ABC-123.CD3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.CD1.mp4", "ABC-123.CD2 .mp4", "ABC-123.CD3.mp4")]
+)
def test_scan_movies__cdx(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123.CD1.mp4'
- assert basenames[1] == 'ABC-123.CD2 .mp4'
- assert basenames[2] == 'ABC-123.CD3.mp4'
+ assert basenames[0] == "ABC-123.CD1.mp4"
+ assert basenames[1] == "ABC-123.CD2 .mp4"
+ assert basenames[2] == "ABC-123.CD3.mp4"
-@pytest.mark.parametrize('files', [('abc123cd1.mp4','abc123cd2.mp4')])
+@pytest.mark.parametrize("files", [("abc123cd1.mp4", "abc123cd2.mp4")])
def test_scan_movies__cdx_without_delimeter(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 2
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'abc123cd1.mp4'
- assert basenames[1] == 'abc123cd2.mp4'
+ assert basenames[0] == "abc123cd1.mp4"
+ assert basenames[1] == "abc123cd2.mp4"
# Folder named after the avid; the parts inside carry no avid information themselves
-@pytest.mark.parametrize('files', [('ABC-123/CD1.mp4','ABC-123/CD2 .mp4','ABC-123/CD3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123/CD1.mp4", "ABC-123/CD2 .mp4", "ABC-123/CD3.mp4")]
+)
def test_scan_movies__from_folder(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'CD1.mp4'
- assert basenames[1] == 'CD2 .mp4'
- assert basenames[2] == 'CD3.mp4'
+ assert basenames[0] == "CD1.mp4"
+ assert basenames[1] == "CD2 .mp4"
+ assert basenames[2] == "CD3.mp4"
# Parts numbered with multi-digit indices
-@pytest.mark.parametrize('files', [('ABC-123.01.mp4','ABC-123.02.mp4','ABC-123.03.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.01.mp4", "ABC-123.02.mp4", "ABC-123.03.mp4")]
+)
def test_scan_movies__0x123(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 3
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'ABC-123.01.mp4'
- assert basenames[1] == 'ABC-123.02.mp4'
- assert basenames[2] == 'ABC-123.03.mp4'
+ assert basenames[0] == "ABC-123.01.mp4"
+ assert basenames[1] == "ABC-123.02.mp4"
+ assert basenames[2] == "ABC-123.03.mp4"
# Invalid: no file matches an avid
-@pytest.mark.parametrize('files', [('什么也没有.mp4',)])
+@pytest.mark.parametrize("files", [("什么也没有.mp4",)])
def test_scan_movies__nothing(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: no file under the CWD matches an avid
-@pytest.mark.parametrize('files', [('什么也没有.mp4',)])
+@pytest.mark.parametrize("files", [("什么也没有.mp4",)])
def test_scan_movies__nothing_in_cwd(prepare_files):
cwd = os.getcwd()
os.chdir(tmp_folder)
try:
- movies = scan_movies('.')
+ movies = scan_movies(".")
finally:
os.chdir(cwd)
assert len(movies) == 0
# Invalid: the parts are named inconsistently
-@pytest.mark.parametrize('files', [('ABC-123-1.mp4','ABC-123-第2部分.mp4','ABC-123-3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123-1.mp4", "ABC-123-第2部分.mp4", "ABC-123-3.mp4")]
+)
def test_scan_movies__strange_names(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: sliced and unsliced files of the same movie are mixed
-@pytest.mark.parametrize('files', [('ABC-123.mp4','ABC-123-1.mp4','ABC-123-2.mp4')])
+@pytest.mark.parametrize("files", [("ABC-123.mp4", "ABC-123-1.mp4", "ABC-123-2.mp4")])
def test_scan_movies__mix_slices(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: the parts are spread across different folders
-@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','sub/ABC-123.CD2.mp4','ABC-123.CD3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.CD1.mp4", "sub/ABC-123.CD2.mp4", "ABC-123.CD3.mp4")]
+)
def test_scan_movies__wrong_structure(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: the first part index is not valid
-@pytest.mark.parametrize('files', [('ABC-123.CD2.mp4','ABC-123.CD3.mp4','ABC-123.CD4.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.CD2.mp4", "ABC-123.CD3.mp4", "ABC-123.CD4.mp4")]
+)
def test_scan_movies__wrong_initial_id(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: the part indices are not consecutive
-@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','ABC-123.CD3.mp4','ABC-123.CD4.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123.CD1.mp4", "ABC-123.CD3.mp4", "ABC-123.CD4.mp4")]
+)
def test_scan_movies__not_consecutive(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# Invalid: duplicate part indices
-@pytest.mark.parametrize('files', [('ABC-123-1.mp4','ABC-123-1 .mp4','ABC-123-3.mp4')])
+@pytest.mark.parametrize(
+ "files", [("ABC-123-1.mp4", "ABC-123-1 .mp4", "ABC-123-3.mp4")]
+)
def test_scan_movies__duplicate_index(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 0
# A mix of valid and invalid data
-@pytest.mark.parametrize('files', [('DEF-456/movie.mp4', 'ABC-123.1.mp4','sub/ABC-123.2.mp4','ABC-123.3.mp4')])
+@pytest.mark.parametrize(
+ "files",
+ [("DEF-456/movie.mp4", "ABC-123.1.mp4", "sub/ABC-123.2.mp4", "ABC-123.3.mp4")],
+)
def test_scan_movies__mix_data(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'DEF-456'
+ assert movies[0].dvdid == "DEF-456"
assert len(movies[0].files) == 1
basenames = [os.path.basename(i) for i in movies[0].files]
- assert basenames[0] == 'movie.mp4'
+ assert basenames[0] == "movie.mp4"
# Folder named after the avid, containing both the movie (with avid) and ad files
-@pytest.mark.parametrize('files', [{'ABC-123/ABC-123.mp4': DEFAULT_SIZE, 'ABC-123/广告1.mp4': 1024, 'ABC-123/广告2.mp4': 243269631}])
+@pytest.mark.parametrize(
+ "files",
+ [
+ {
+ "ABC-123/ABC-123.mp4": DEFAULT_SIZE,
+ "ABC-123/广告1.mp4": 1024,
+ "ABC-123/广告2.mp4": 243269631,
+ }
+ ],
+)
def test_scan_movies__1_video_with_ad(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 1
- assert movies[0].dvdid == 'ABC-123'
+ assert movies[0].dvdid == "ABC-123"
assert len(movies[0].files) == 1
# Folder containing several avid-named movies plus ad files
-@pytest.mark.parametrize('files', [{'ABC-123.mp4': DEFAULT_SIZE, 'DEF-456.mp4': DEFAULT_SIZE, '广告1.mp4': 1024, '广告2.mp4': 243269631}])
+@pytest.mark.parametrize(
+ "files",
+ [
+ {
+ "ABC-123.mp4": DEFAULT_SIZE,
+ "DEF-456.mp4": DEFAULT_SIZE,
+ "广告1.mp4": 1024,
+ "广告2.mp4": 243269631,
+ }
+ ],
+)
def test_scan_movies__n_video_with_ad(prepare_files):
movies = scan_movies(tmp_folder)
assert len(movies) == 2
- assert movies[0].dvdid == 'ABC-123' and movies[1].dvdid == 'DEF-456'
+ assert movies[0].dvdid == "ABC-123" and movies[1].dvdid == "DEF-456"
assert all(len(i.files) == 1 for i in movies)
diff --git a/unittest/test_func.py b/unittest/test_func.py
index ca6d0560f..ec8328b35 100644
--- a/unittest/test_func.py
+++ b/unittest/test_func.py
@@ -2,16 +2,16 @@
import sys
import random
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.func import *
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from javsp.func import *
def test_remove_trail_actor_in_title():
run = remove_trail_actor_in_title
- delimiters = list('-xX &·,; &・,;')
- title1 = '东风夜放花千树,更吹落、星如雨。'
- title2 = '辛弃疾 ' + title1
- names = ['辛弃疾', '牛顿', '爱因斯坦', '阿基米德', '伽利略']
+ delimiters = list("-xX &·,; &・,;")
+ title1 = "东风夜放花千树,更吹落、星如雨。"
+ title2 = "辛弃疾 " + title1
+ names = ["辛弃疾", "牛顿", "爱因斯坦", "阿基米德", "伽利略"]
def combine(items):
sep = random.choice(delimiters)
@@ -20,7 +20,7 @@ def combine(items):
return new_str
    # Define the test cases
- assert title1 == run(combine([title1, '辛弃疾']), names)
+ assert title1 == run(combine([title1, "辛弃疾"]), names)
assert title1 == run(combine([title1] + names), names)
- assert title1 == run(combine([title1, '辛弃疾']), names)
+ assert title1 == run(combine([title1, "辛弃疾"]), names)
assert title2 == run(combine([title2] + names), names)
diff --git a/unittest/test_lib.py b/unittest/test_lib.py
index 43a05338c..adff36e73 100644
--- a/unittest/test_lib.py
+++ b/unittest/test_lib.py
@@ -1,26 +1,26 @@
import os
import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.lib import *
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from javsp.lib import *
def test_detect_special_attr():
run = detect_special_attr
    # Define the test cases
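+    # Judging from the cases below, "U" flags an uncensored leak and "C" a
+    # Chinese-subtitled copy.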
- assert run('STARS-225_UNCENSORED_LEAKED.mp4') == 'U'
- assert run('STARS-225_UNCENSORED_LEAKED-C.mp4') == 'UC'
- assert run('STARS-225_无码.mp4') == ''
- assert run('STARS-225_无码流出.mp4') == 'U'
- assert run('STARS-225_无码破解.mp4') == 'U'
- assert run('STARS-225_UNCEN.mp4') == 'U'
- assert run('STARS-225_UNCEN-C.mp4') == 'UC'
- assert run('STARS-225u.mp4', 'STARS-225') == 'U'
- assert run('STARS-225C.mp4', 'STARS-225') == 'C'
- assert run('STARS-225uC.mp4', 'STARS-225') == 'UC'
- assert run('STARS225u.mp4', 'STARS-225') == 'U'
- assert run('STARS225C.mp4', 'STARS-225') == 'C'
- assert run('STARS225uC.mp4', 'STARS-225') == 'UC'
- assert run('STARS-225CD1.mp4', 'STARS-225') == ''
- assert run('stars225cd2.mp4', 'STARS-225') == ''
+ assert run("STARS-225_UNCENSORED_LEAKED.mp4") == "U"
+ assert run("STARS-225_UNCENSORED_LEAKED-C.mp4") == "UC"
+ assert run("STARS-225_无码.mp4") == ""
+ assert run("STARS-225_无码流出.mp4") == "U"
+ assert run("STARS-225_无码破解.mp4") == "U"
+ assert run("STARS-225_UNCEN.mp4") == "U"
+ assert run("STARS-225_UNCEN-C.mp4") == "UC"
+ assert run("STARS-225u.mp4", "STARS-225") == "U"
+ assert run("STARS-225C.mp4", "STARS-225") == "C"
+ assert run("STARS-225uC.mp4", "STARS-225") == "UC"
+ assert run("STARS225u.mp4", "STARS-225") == "U"
+ assert run("STARS225C.mp4", "STARS-225") == "C"
+ assert run("STARS225uC.mp4", "STARS-225") == "UC"
+ assert run("STARS-225CD1.mp4", "STARS-225") == ""
+ assert run("stars225cd2.mp4", "STARS-225") == ""
diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py
index 1537d93ad..4be1152ca 100644
--- a/unittest/test_proxyfree.py
+++ b/unittest/test_proxyfree.py
@@ -1,18 +1,19 @@
import os
import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from javsp.web.proxyfree import *
def test_get_url():
- assert get_proxy_free_url('javlib') != ''
- assert get_proxy_free_url('javdb') != ''
+ assert get_proxy_free_url("javlib") != ""
+ assert get_proxy_free_url("javdb") != ""
def test_get_url_with_prefer():
- prefer_url = 'https://www.baidu.com'
- assert prefer_url == get_proxy_free_url('javlib', prefer_url)
+ prefer_url = "https://www.baidu.com"
+ assert prefer_url == get_proxy_free_url("javlib", prefer_url)
+
if __name__ == "__main__":
- print(get_proxy_free_url('javlib'))
+ print(get_proxy_free_url("javlib"))