diff --git a/.github/workflows/test-basic-funcs.yml b/.github/workflows/test-basic-funcs.yml index 9d3d85c34..de3330205 100644 --- a/.github/workflows/test-basic-funcs.yml +++ b/.github/workflows/test-basic-funcs.yml @@ -40,12 +40,8 @@ jobs: - name: Install dependencies run: | poetry install - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with ruff + run: poetry run ruff check . - name: Test avid.py run: | poetry run pytest unittest/test_avid.py diff --git a/javsp/__main__.py b/javsp/__main__.py index 7771170e7..ff93f49b0 100644 --- a/javsp/__main__.py +++ b/javsp/__main__.py @@ -1,30 +1,29 @@ +import json +import logging import os import re import sys -import json +import threading import time -import logging +from typing import Dict, List + +import requests from PIL import Image from pydantic import ValidationError from pydantic_extra_types.pendulum_dt import Duration -import requests -import threading -from typing import Dict, List -sys.stdout.reconfigure(encoding='utf-8') +sys.stdout.reconfigure(encoding="utf-8") import colorama import pretty_errors from colorama import Fore, Style from tqdm import tqdm - pretty_errors.configure(display_link=True) +from javsp.cropper import get_cropper from javsp.print import TqdmOut -from javsp.cropper import Cropper, get_cropper - # 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作 root_logger = logging.getLogger() @@ -32,23 +31,23 @@ if type(handler) == logging.StreamHandler: handler.stream = TqdmOut -logger = logging.getLogger('main') +logger = logging.getLogger("main") -from javsp.lib import resource_path -from javsp.nfo import write_nfo +from javsp.config import Cfg, CrawlerID +from javsp.datatype import Movie, MovieInfo from javsp.file import * from javsp.func import * from javsp.image import * -from javsp.datatype import Movie, MovieInfo +from javsp.lib import resource_path +from javsp.nfo import write_nfo from javsp.web.base import download from javsp.web.exceptions import * from javsp.web.translate import translate_movie_info -from javsp.config import Cfg, CrawlerID - actressAliasMap = {} + def resolve_alias(name): """将别名解析为固定的名字""" for fixedName, aliases in actressAliasMap.items(): @@ -68,30 +67,33 @@ def import_crawlers(): # if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)): # logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器') # continue - import_name = 'javsp.web.' + name + import_name = "javsp.web." 
+ name __import__(import_name) - valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用 + valid_mods.append( + import_name + ) # 抓取器有效: 使用完整模块路径,便于程序实际使用 except ModuleNotFoundError: - unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示 + unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示 if unknown_mods: - logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods)) + logger.warning("配置的抓取器无效: " + ", ".join(unknown_mods)) # 爬虫是IO密集型任务,可以通过多线程提升效率 def parallel_crawler(movie: Movie, tqdm_bar=None): """使用多线程抓取不同网站的数据""" + def wrapper(parser, info: MovieInfo, retry): """对抓取器函数进行包装,便于更新提示信息和自动重试""" crawler_name = threading.current_thread().name - task_info = f'Crawler: {crawler_name}: {info.dvdid}' + task_info = f"Crawler: {crawler_name}: {info.dvdid}" for cnt in range(retry): try: parser(info) movie_id = info.dvdid or info.cid logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'") - setattr(info, 'success', True) + setattr(info, "success", True) if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 抓取完成') + tqdm_bar.set_description(f"{crawler_name}: 抓取完成") break except MovieNotFoundError as e: logger.debug(e) @@ -103,9 +105,11 @@ def wrapper(parser, info: MovieInfo, retry): logger.error(e) break except requests.exceptions.RequestException as e: - logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}') + logger.debug( + f"{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}" + ) if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试') + tqdm_bar.set_description(f"{crawler_name}: 网络错误,正在重试") except Exception as e: logger.exception(e) @@ -114,7 +118,7 @@ def wrapper(parser, info: MovieInfo, retry): all_info = {i.value: MovieInfo(movie) for i in crawler_mods} # 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取 - if movie.data_src == 'cid' and movie.dvdid: + if movie.data_src == "cid" and movie.dvdid: crawler_mods = crawler_mods + Cfg().crawler.selection.normal for i in all_info.values(): i.dvdid = None @@ -123,13 +127,15 @@ def wrapper(parser, info: MovieInfo, retry): thread_pool = [] for mod_partial, info in all_info.items(): mod = f"javsp.web.{mod_partial}" - parser = getattr(sys.modules[mod], 'parse_data') + parser = getattr(sys.modules[mod], "parse_data") # 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新 # TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1 - if hasattr(sys.modules[mod], 'parse_data_raw'): + if hasattr(sys.modules[mod], "parse_data_raw"): th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1)) else: - th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry)) + th = threading.Thread( + target=wrapper, name=mod, args=(parser, info, Cfg().network.retry) + ) th.start() thread_pool.append(th) # 等待所有线程结束 @@ -138,22 +144,28 @@ def wrapper(parser, info: MovieInfo, retry): th: threading.Thread th.join(timeout=timeout) # 根据抓取结果更新影片类型判定 - if movie.data_src == 'cid' and movie.dvdid: + if movie.data_src == "cid" and movie.dvdid: titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]] if any(titles): movie.dvdid = None - all_info = {k: v for k, v in all_info.items() if k in Cfg().crawler.selection['cid']} + all_info = { + k: v for k, v in all_info.items() if k in Cfg().crawler.selection["cid"] + } else: - logger.debug(f'自动更正影片数据源类型: {movie.dvdid} ({movie.cid}): normal') - movie.data_src = 'normal' + logger.debug(f"自动更正影片数据源类型: {movie.dvdid} ({movie.cid}): normal") + movie.data_src = "normal" movie.cid = None - all_info = {k: v for k, v in all_info.items() 
if k not in Cfg().crawler.selection['cid']} + all_info = { + k: v + for k, v in all_info.items() + if k not in Cfg().crawler.selection["cid"] + } # 删除抓取失败的站点对应的数据 - all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')} + all_info = {k: v for k, v in all_info.items() if hasattr(v, "success")} for info in all_info.values(): del info.success # 删除all_info中键名中的'web.' - all_info = {k[4:]:v for k,v in all_info.items()} + all_info = {k[4:]: v for k, v in all_info.items()} return all_info @@ -162,8 +174,8 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): final_info = MovieInfo(movie) ########## 部分字段配置了专门的选取逻辑,先处理这些字段 ########## # genre - if 'javdb' in all_info and all_info['javdb'].genre: - final_info.genre = all_info['javdb'].genre + if "javdb" in all_info and all_info["javdb"].genre: + final_info.genre = all_info["javdb"].genre ########## 移除所有抓取器数据中,标题尾部的女优名 ########## if Cfg().summarizer.title.remove_trailing_actor_name: @@ -172,7 +184,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): ########## 然后检查所有字段,如果某个字段还是默认值,则按照优先级选取数据 ########## # parser直接更新了all_info中的项目,而初始all_info是按照优先级生成的,已经符合配置的优先级顺序了 # 按照优先级取出各个爬虫获取到的信息 - attrs = [i for i in dir(final_info) if not i.startswith('_')] + attrs = [i for i in dir(final_info) if not i.startswith("_")] covers, big_covers = [], [] for name, data in all_info.items(): absorbed = [] @@ -180,15 +192,15 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): for attr in attrs: incoming = getattr(data, attr) current = getattr(final_info, attr) - if attr == 'cover': + if attr == "cover": if incoming and (incoming not in covers): covers.append(incoming) absorbed.append(attr) - elif attr == 'big_cover': + elif attr == "big_cover": if incoming and (incoming not in big_covers): big_covers.append(incoming) absorbed.append(attr) - elif attr == 'uncensored': + elif attr == "uncensored": if (current is None) and (incoming is not None): setattr(final_info, attr, incoming) absorbed.append(attr) @@ -197,7 +209,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): setattr(final_info, attr, incoming) absorbed.append(attr) if absorbed: - logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed)) + logger.debug(f"从'{name}'中获取了字段: " + " ".join(absorbed)) # 使用网站的番号作为番号 if Cfg().crawler.respect_site_avid: id_weight = {} @@ -209,14 +221,19 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): id_weight.setdefault(data.cid, []).append(name) # 根据权重选择最终番号 if id_weight: - id_weight = {k:v for k, v in sorted(id_weight.items(), key=lambda x:len(x[1]), reverse=True)} + id_weight = { + k: v + for k, v in sorted( + id_weight.items(), key=lambda x: len(x[1]), reverse=True + ) + } final_id = list(id_weight.keys())[0] if movie.dvdid: final_info.dvdid = final_id else: final_info.cid = final_id # javdb封面有水印,优先采用其他站点的封面 - javdb_cover = getattr(all_info.get('javdb'), 'cover', None) + javdb_cover = getattr(all_info.get("javdb"), "cover", None) if javdb_cover is not None: match Cfg().crawler.use_javdb_cover: case UseJavDBCover.fallback: @@ -225,8 +242,8 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): case UseJavDBCover.no: covers.remove(javdb_cover) - setattr(final_info, 'covers', covers) - setattr(final_info, 'big_covers', big_covers) + setattr(final_info, "covers", covers) + setattr(final_info, "big_covers", big_covers) # 对cover和big_cover赋值,避免后续检查必须字段时出错 if covers: final_info.cover = covers[0] @@ -237,16 +254,17 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): if 
final_info.genre is None: final_info.genre = [] if movie.hard_sub: - final_info.genre.append('内嵌字幕') + final_info.genre.append("内嵌字幕") if movie.uncensored: - final_info.genre.append('无码流出/破解') + final_info.genre.append("无码流出/破解") # 女优别名固定 if Cfg().crawler.normalize_actress_name and bool(final_info.actress_pics): final_info.actress = [resolve_alias(i) for i in final_info.actress] if final_info.actress_pics: final_info.actress_pics = { - resolve_alias(key): value for key, value in final_info.actress_pics.items() + resolve_alias(key): value + for key, value in final_info.actress_pics.items() } # 检查是否所有必需的字段都已经获得了值 @@ -258,127 +276,161 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): movie.info = final_info return True + def generate_names(movie: Movie): """按照模板生成相关文件的文件名""" info = movie.info # 准备用来填充命名模板的字典 d = info.get_info_dic() if info.actress and len(info.actress) > Cfg().summarizer.path.max_actress_count: - logging.debug('女优人数过多,按配置保留了其中的前n个: ' + ','.join(info.actress)) - actress = info.actress[:Cfg().summarizer.path.max_actress_count] + ['…'] + logging.debug( + "女优人数过多,按配置保留了其中的前n个: " + ",".join(info.actress) + ) + actress = info.actress[: Cfg().summarizer.path.max_actress_count] + ["…"] else: actress = info.actress - d['actress'] = ','.join(actress) if actress else Cfg().summarizer.default.actress + d["actress"] = ",".join(actress) if actress else Cfg().summarizer.default.actress # 保存label供后面判断裁剪图片的方式使用 - setattr(info, 'label', d['label'].upper()) + setattr(info, "label", d["label"].upper()) # 处理字段:替换不能作为文件名的字符,移除首尾的空字符 for k, v in d.items(): d[k] = replace_illegal_chars(v.strip()) # 生成nfo文件中的影片标题 nfo_title = Cfg().summarizer.nfo.title_pattern.format(**d) - setattr(info, 'nfo_title', nfo_title) - + setattr(info, "nfo_title", nfo_title) + # 使用字典填充模板,生成相关文件的路径(多分片影片要考虑CD-x部分) - cdx = '' if len(movie.files) <= 1 else '-CD1' - if hasattr(info, 'title_break'): + cdx = "" if len(movie.files) <= 1 else "-CD1" + if hasattr(info, "title_break"): title_break = info.title_break else: - title_break = split_by_punc(d['title']) - if hasattr(info, 'ori_title_break'): + title_break = split_by_punc(d["title"]) + if hasattr(info, "ori_title_break"): ori_title_break = info.ori_title_break else: - ori_title_break = split_by_punc(d['rawtitle']) + ori_title_break = split_by_punc(d["rawtitle"]) copyd = d.copy() - copyd['num'] = copyd['num'] + movie.attr_str + copyd["num"] = copyd["num"] + movie.attr_str longest_ext = max((os.path.splitext(i)[1] for i in movie.files), key=len) for end in range(len(ori_title_break), 0, -1): - copyd['rawtitle'] = replace_illegal_chars(''.join(ori_title_break[:end]).strip()) + copyd["rawtitle"] = replace_illegal_chars( + "".join(ori_title_break[:end]).strip() + ) for sub_end in range(len(title_break), 0, -1): - copyd['title'] = replace_illegal_chars(''.join(title_break[:sub_end]).strip()) + copyd["title"] = replace_illegal_chars( + "".join(title_break[:sub_end]).strip() + ) if Cfg().summarizer.move_files: - save_dir = os.path.normpath(Cfg().summarizer.path.output_folder_pattern.format(**copyd)).strip() - basename = os.path.normpath(Cfg().summarizer.path.basename_pattern.format(**copyd)).strip() + save_dir = os.path.normpath( + Cfg().summarizer.path.output_folder_pattern.format(**copyd) + ).strip() + basename = os.path.normpath( + Cfg().summarizer.path.basename_pattern.format(**copyd) + ).strip() else: # 如果不整理文件,则保存抓取的数据到当前目录 save_dir = os.path.dirname(movie.files[0]) filebasename = os.path.basename(movie.files[0]) ext = os.path.splitext(filebasename)[1] - 
basename = filebasename.replace(ext, '') - long_path = os.path.join(save_dir, basename+longest_ext) + basename = filebasename.replace(ext, "") + long_path = os.path.join(save_dir, basename + longest_ext) remaining = get_remaining_path_len(os.path.abspath(long_path)) if remaining > 0: movie.save_dir = save_dir movie.basename = basename - movie.nfo_file = os.path.join(save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + '.nfo') - movie.fanart_file = os.path.join(save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + '.jpg') - movie.poster_file = os.path.join(save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + '.jpg') - if d['title'] != copyd['title']: + movie.nfo_file = os.path.join( + save_dir, + Cfg().summarizer.nfo.basename_pattern.format(**copyd) + ".nfo", + ) + movie.fanart_file = os.path.join( + save_dir, + Cfg().summarizer.fanart.basename_pattern.format(**copyd) + ".jpg", + ) + movie.poster_file = os.path.join( + save_dir, + Cfg().summarizer.cover.basename_pattern.format(**copyd) + ".jpg", + ) + if d["title"] != copyd["title"]: logger.info(f"自动截短标题为:\n{copyd['title']}") - if d['rawtitle'] != copyd['rawtitle']: + if d["rawtitle"] != copyd["rawtitle"]: logger.info(f"自动截短原始标题为:\n{copyd['rawtitle']}") return else: # 以防万一,当整理路径非常深或者标题起始很长一段没有标点符号时,硬性截短生成的名称 - copyd['title'] = copyd['title'][:remaining] - copyd['rawtitle'] = copyd['rawtitle'][:remaining] + copyd["title"] = copyd["title"][:remaining] + copyd["rawtitle"] = copyd["rawtitle"][:remaining] # 如果不整理文件,则保存抓取的数据到当前目录 if not Cfg().summarizer.move_files: save_dir = os.path.dirname(movie.files[0]) filebasename = os.path.basename(movie.files[0]) ext = os.path.splitext(filebasename)[1] - basename = filebasename.replace(ext, '') + basename = filebasename.replace(ext, "") else: - save_dir = os.path.normpath(Cfg().summarizer.path.output_folder_pattern.format(**copyd)).strip() - basename = os.path.normpath(Cfg().summarizer.path.basename_pattern.format(**copyd)).strip() + save_dir = os.path.normpath( + Cfg().summarizer.path.output_folder_pattern.format(**copyd) + ).strip() + basename = os.path.normpath( + Cfg().summarizer.path.basename_pattern.format(**copyd) + ).strip() movie.save_dir = save_dir movie.basename = basename - movie.nfo_file = os.path.join(save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + '.nfo') - movie.fanart_file = os.path.join(save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + '.jpg') - movie.poster_file = os.path.join(save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + '.jpg') - - if d['title'] != copyd['title']: + movie.nfo_file = os.path.join( + save_dir, Cfg().summarizer.nfo.basename_pattern.format(**copyd) + ".nfo" + ) + movie.fanart_file = os.path.join( + save_dir, Cfg().summarizer.fanart.basename_pattern.format(**copyd) + ".jpg" + ) + movie.poster_file = os.path.join( + save_dir, Cfg().summarizer.cover.basename_pattern.format(**copyd) + ".jpg" + ) + + if d["title"] != copyd["title"]: logger.info(f"自动截短标题为:\n{copyd['title']}") - if d['rawtitle'] != copyd['rawtitle']: + if d["rawtitle"] != copyd["rawtitle"]: logger.info(f"自动截短原始标题为:\n{copyd['rawtitle']}") + def reviewMovieID(all_movies, root): """人工检查每一部影片的番号""" count = len(all_movies) - logger.info('进入手动模式检查番号: ') + logger.info("进入手动模式检查番号: ") for i, movie in enumerate(all_movies, start=1): id = repr(movie)[7:-2] - print(f'[{i}/{count}]\t{Fore.LIGHTMAGENTA_EX}{id}{Style.RESET_ALL}, 对应文件:') + print(f"[{i}/{count}]\t{Fore.LIGHTMAGENTA_EX}{id}{Style.RESET_ALL}, 对应文件:") relpaths = 
[os.path.relpath(i, root) for i in movie.files] - print('\n'.join([' '+i for i in relpaths])) - s = input("回车确认当前番号,或直接输入更正后的番号(如'ABC-123'或'cid:sqte00300')") + print("\n".join([" " + i for i in relpaths])) + s = input( + "回车确认当前番号,或直接输入更正后的番号(如'ABC-123'或'cid:sqte00300')" + ) if not s: logger.info(f"已确认影片番号: {','.join(relpaths)}: {id}") else: s = s.strip() s_lc = s.lower() - if s_lc.startswith(('cid:', 'cid=')): + if s_lc.startswith(("cid:", "cid=")): new_movie = Movie(cid=s_lc[4:]) - new_movie.data_src = 'cid' + new_movie.data_src = "cid" new_movie.files = movie.files - elif s_lc.startswith('fc2'): + elif s_lc.startswith("fc2"): new_movie = Movie(s) - new_movie.data_src = 'fc2' + new_movie.data_src = "fc2" new_movie.files = movie.files else: new_movie = Movie(s) - new_movie.data_src = 'normal' + new_movie.data_src = "normal" new_movie.files = movie.files - all_movies[i-1] = new_movie + all_movies[i - 1] = new_movie new_id = repr(new_movie)[7:-2] logger.info(f"已更正影片番号: {','.join(relpaths)}: {id} -> {new_id}") print() -SUBTITLE_MARK_FILE = Image.open(os.path.abspath(resource_path('image/sub_mark.png'))) -UNCENSORED_MARK_FILE = Image.open(os.path.abspath(resource_path('image/unc_mark.png'))) +SUBTITLE_MARK_FILE = Image.open(os.path.abspath(resource_path("image/sub_mark.png"))) +UNCENSORED_MARK_FILE = Image.open(os.path.abspath(resource_path("image/unc_mark.png"))) + def process_poster(movie: Movie): def should_use_ai_crop_match(label): @@ -386,10 +438,13 @@ def should_use_ai_crop_match(label): re.match(r, label) return True return False + crop_engine = None - if (movie.info.uncensored or - movie.data_src == 'fc2' or - should_use_ai_crop_match(movie.info.label.upper())): + if ( + movie.info.uncensored + or movie.data_src == "fc2" + or should_use_ai_crop_match(movie.info.label.upper()) + ): crop_engine = Cfg().summarizer.cover.crop.engine cropper = get_cropper(crop_engine) fanart_image = Image.open(movie.fanart_file) @@ -397,21 +452,27 @@ def should_use_ai_crop_match(label): if Cfg().summarizer.cover.add_label: if movie.hard_sub: - fanart_cropped = add_label_to_poster(fanart_cropped, SUBTITLE_MARK_FILE, LabelPostion.BOTTOM_RIGHT) + fanart_cropped = add_label_to_poster( + fanart_cropped, SUBTITLE_MARK_FILE, LabelPostion.BOTTOM_RIGHT + ) if movie.uncensored: - fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT) + fanart_cropped = add_label_to_poster( + fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT + ) fanart_cropped.save(movie.poster_file) + def RunNormalMode(all_movies): """普通整理模式""" - def check_step(result, msg='步骤错误'): + + def check_step(result, msg="步骤错误"): """检查一个整理步骤的结果,并负责更新tqdm的进度""" if result: inner_bar.update() else: - raise Exception(msg + '\n') + raise Exception(msg + "\n") - outer_bar = tqdm(all_movies, desc='整理影片', ascii=True, leave=False) + outer_bar = tqdm(all_movies, desc="整理影片", ascii=True, leave=False) total_step = 6 if Cfg().translator.engine: total_step += 1 @@ -423,34 +484,36 @@ def check_step(result, msg='步骤错误'): try: # 初始化本次循环要整理影片任务 filenames = [os.path.split(i)[1] for i in movie.files] - logger.info('正在整理: ' + ', '.join(filenames)) - inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False) + logger.info("正在整理: " + ", ".join(filenames)) + inner_bar = tqdm(total=total_step, desc="步骤", ascii=True, leave=False) # 依次执行各个步骤 - inner_bar.set_description(f'启动并发任务') + inner_bar.set_description("启动并发任务") all_info = parallel_crawler(movie, inner_bar) - msg = 
f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息' + msg = f"为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息" check_step(all_info, msg) - inner_bar.set_description('汇总数据') + inner_bar.set_description("汇总数据") has_required_keys = info_summary(movie, all_info) check_step(has_required_keys) if Cfg().translator.engine: - inner_bar.set_description('翻译影片信息') + inner_bar.set_description("翻译影片信息") success = translate_movie_info(movie.info) check_step(success) generate_names(movie) - check_step(movie.save_dir, '无法按命名规则生成目标文件夹') + check_step(movie.save_dir, "无法按命名规则生成目标文件夹") if not os.path.exists(movie.save_dir): os.makedirs(movie.save_dir) - inner_bar.set_description('下载封面图片') + inner_bar.set_description("下载封面图片") if Cfg().summarizer.cover.highres: - cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers) + cover_dl = download_cover( + movie.info.covers, movie.fanart_file, movie.info.big_covers + ) else: cover_dl = download_cover(movie.info.covers, movie.fanart_file) - check_step(cover_dl, '下载封面图片失败') + check_step(cover_dl, "下载封面图片失败") cover, pic_path = cover_dl # 确保实际下载的封面的url与即将写入到movie.info中的一致 if cover != movie.info.cover: @@ -466,23 +529,31 @@ def check_step(result, msg='步骤错误'): check_step(True) if Cfg().summarizer.extra_fanarts.enabled: - scrape_interval = Cfg().summarizer.extra_fanarts.scrap_interval.total_seconds() - inner_bar.set_description('下载剧照') + scrape_interval = ( + Cfg().summarizer.extra_fanarts.scrap_interval.total_seconds() + ) + inner_bar.set_description("下载剧照") if movie.info.preview_pics: - extrafanartdir = movie.save_dir + '/extrafanart' + extrafanartdir = movie.save_dir + "/extrafanart" os.mkdir(extrafanartdir) - for (id, pic_url) in enumerate(movie.info.preview_pics): - inner_bar.set_description(f"Downloading extrafanart {id} from url: {pic_url}") - + for id, pic_url in enumerate(movie.info.preview_pics): + inner_bar.set_description( + f"Downloading extrafanart {id} from url: {pic_url}" + ) + fanart_destination = f"{extrafanartdir}/{id}.png" try: info = download(pic_url, fanart_destination) if valid_pic(fanart_destination): filesize = get_fmt_size(pic_path) width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' - logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]") + elapsed = time.strftime( + "%M:%S", time.gmtime(info["elapsed"]) + ) + speed = get_fmt_size(info["rate"]) + "/s" + logger.info( + f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]" + ) else: check_step(False, f"下载剧照{id}: {pic_url}失败") except: @@ -490,18 +561,20 @@ def check_step(result, msg='步骤错误'): time.sleep(scrape_interval) check_step(True) - inner_bar.set_description('写入NFO') + inner_bar.set_description("写入NFO") write_nfo(movie.info, movie.nfo_file) check_step(True) if Cfg().summarizer.move_files: - inner_bar.set_description('移动影片文件') + inner_bar.set_description("移动影片文件") movie.rename_files(Cfg().summarizer.path.hard_link) check_step(True) - logger.info(f'整理完成,相关文件已保存到: {movie.save_dir}\n') + logger.info(f"整理完成,相关文件已保存到: {movie.save_dir}\n") else: - logger.info(f'刮削完成,相关文件已保存到: {movie.nfo_file}\n') + logger.info(f"刮削完成,相关文件已保存到: {movie.nfo_file}\n") - if movie != all_movies[-1] and Cfg().crawler.sleep_after_scraping > Duration(0): + if movie != all_movies[ + -1 + ] and Cfg().crawler.sleep_after_scraping > Duration(0): time.sleep(Cfg().crawler.sleep_after_scraping.total_seconds()) 
return_movies.append(movie) # except Exception as e: @@ -523,9 +596,11 @@ def download_cover(covers, fanart_path, big_covers=[]): if valid_pic(pic_path): filesize = get_fmt_size(pic_path) width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' - logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]") + elapsed = time.strftime("%M:%S", time.gmtime(info["elapsed"])) + speed = get_fmt_size(info["rate"]) + "/s" + logger.info( + f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]" + ) return (url, pic_path) except requests.exceptions.HTTPError: # HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试 @@ -544,20 +619,22 @@ def download_cover(covers, fanart_path, big_covers=[]): break except Exception as e: logger.debug(e, exc_info=True) - logger.error(f"下载封面图片失败") - logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers)) + logger.error("下载封面图片失败") + logger.debug("big_covers:" + str(big_covers) + ", covers" + str(covers)) return None + def get_pic_path(fanart_path, url): fanart_base = os.path.splitext(fanart_path)[0] - pic_extend = url.split('.')[-1] + pic_extend = url.split(".")[-1] # 判断 url 是否带?后面的参数 - if '?' in pic_extend: - pic_extend = pic_extend.split('?')[0] - + if "?" in pic_extend: + pic_extend = pic_extend.split("?")[0] + pic_path = fanart_base + "." + pic_extend return pic_path + def error_exit(success, err_info): """检查业务逻辑是否成功完成,如果失败则报错退出程序""" if not success: @@ -581,24 +658,25 @@ def entry(): colorama.init(autoreset=True) # 检查更新 - version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行') - logger.debug(version_info.center(60, '=')) + version_info = "JavSP " + getattr(sys, "javsp_version", "未知版本/从代码运行") + logger.debug(version_info.center(60, "=")) check_update(Cfg().other.check_update, Cfg().other.auto_update) root = get_scan_dir(Cfg().scanner.input_directory) - error_exit(root, '未选择要扫描的文件夹') + error_exit(root, "未选择要扫描的文件夹") # 导入抓取器,必须在chdir之前 import_crawlers() os.chdir(root) - print(f'扫描影片文件...') + print("扫描影片文件...") recognized = scan_movies(root) movie_count = len(recognized) recognize_fail = [] - error_exit(movie_count, '未找到影片文件') - logger.info(f'扫描影片文件:共找到 {movie_count} 部影片') + error_exit(movie_count, "未找到影片文件") + logger.info(f"扫描影片文件:共找到 {movie_count} 部影片") RunNormalMode(recognized + recognize_fail) sys.exit(0) + if __name__ == "__main__": entry() diff --git a/javsp/avid.py b/javsp/avid.py index f535f1fee..2e22b8b2b 100644 --- a/javsp/avid.py +++ b/javsp/avid.py @@ -1,149 +1,159 @@ """获取和转换影片的各类番号(DVD ID, DMM cid, DMM pid)""" + import os import re from pathlib import Path -__all__ = ['get_id', 'get_cid', 'guess_av_type'] +__all__ = ["get_id", "get_cid", "guess_av_type"] from javsp.config import Cfg + def get_id(filepath_str: str) -> str: """从给定的文件路径中提取番号(DVD ID)""" filepath = Path(filepath_str) # 通常是接收文件的路径,当然如果是普通字符串也可以 - ignore_pattern = re.compile('|'.join(Cfg().scanner.ignored_id_pattern)) - norm = ignore_pattern.sub('', filepath.stem).upper() - if 'FC2' in norm: + ignore_pattern = re.compile("|".join(Cfg().scanner.ignored_id_pattern)) + norm = ignore_pattern.sub("", filepath.stem).upper() + if "FC2" in norm: # 根据FC2 Club的影片数据,FC2编号为5-7个数字 - match = re.search(r'FC2[^A-Z\d]{0,5}(PPV[^A-Z\d]{0,5})?(\d{5,7})', norm, re.I) + match = re.search(r"FC2[^A-Z\d]{0,5}(PPV[^A-Z\d]{0,5})?(\d{5,7})", norm, re.I) if match: - return 'FC2-' + match.group(2) - elif 'HEYDOUGA' in norm: - match = re.search(r'(HEYDOUGA)[-_]*(\d{4})[-_]0?(\d{3,5})', norm, re.I) + return 
"FC2-" + match.group(2) + elif "HEYDOUGA" in norm: + match = re.search(r"(HEYDOUGA)[-_]*(\d{4})[-_]0?(\d{3,5})", norm, re.I) if match: - return '-'.join(match.groups()) - elif 'GETCHU' in norm: - match = re.search(r'GETCHU[-_]*(\d+)', norm, re.I) + return "-".join(match.groups()) + elif "GETCHU" in norm: + match = re.search(r"GETCHU[-_]*(\d+)", norm, re.I) if match: - return 'GETCHU-' + match.group(1) - elif 'GYUTTO' in norm: - match = re.search(r'GYUTTO-(\d+)', norm, re.I) + return "GETCHU-" + match.group(1) + elif "GYUTTO" in norm: + match = re.search(r"GYUTTO-(\d+)", norm, re.I) if match: - return 'GYUTTO-' + match.group(1) - elif '259LUXU' in norm: # special case having form of '259luxu' - match = re.search(r'259LUXU-(\d+)', norm, re.I) + return "GYUTTO-" + match.group(1) + elif "259LUXU" in norm: # special case having form of '259luxu' + match = re.search(r"259LUXU-(\d+)", norm, re.I) if match: - return '259LUXU-' + match.group(1) + return "259LUXU-" + match.group(1) else: # 先尝试移除可疑域名进行匹配,如果匹配不到再使用原始文件名进行匹配 - no_domain = re.sub(r'\w{3,10}\.(COM|NET|APP|XYZ)', '', norm, flags=re.I) + no_domain = re.sub(r"\w{3,10}\.(COM|NET|APP|XYZ)", "", norm, flags=re.I) if no_domain != norm: avid = get_id(no_domain) if avid: return avid # 匹配缩写成hey的heydouga影片。由于番号分三部分,要先于后面分两部分的进行匹配 - match = re.search(r'(?:HEY)[-_]*(\d{4})[-_]0?(\d{3,5})', norm, re.I) + match = re.search(r"(?:HEY)[-_]*(\d{4})[-_]0?(\d{3,5})", norm, re.I) if match: - return 'heydouga-' + '-'.join(match.groups()) + return "heydouga-" + "-".join(match.groups()) # 匹配片商 MUGEN 的奇怪番号。由于MK3D2DBD的模式,要放在普通番号模式之前进行匹配 - match = re.search(r'(MKB?D)[-_]*(S\d{2,3})|(MK3D2DBD|S2M|S2MBD)[-_]*(\d{2,3})', norm, re.I) + match = re.search( + r"(MKB?D)[-_]*(S\d{2,3})|(MK3D2DBD|S2M|S2MBD)[-_]*(\d{2,3})", norm, re.I + ) if match: if match.group(1) is not None: - avid = match.group(1) + '-' + match.group(2) + avid = match.group(1) + "-" + match.group(2) else: - avid = match.group(3) + '-' + match.group(4) + avid = match.group(3) + "-" + match.group(4) return avid # 匹配IBW这样带有后缀z的番号 - match = re.search(r'(IBW)[-_](\d{2,5}z)', norm, re.I) + match = re.search(r"(IBW)[-_](\d{2,5}z)", norm, re.I) if match: - return match.group(1) + '-' + match.group(2) + return match.group(1) + "-" + match.group(2) # 普通番号,优先尝试匹配带分隔符的(如ABC-123) - match = re.search(r'([A-Z]{2,10})[-_](\d{2,5})', norm, re.I) + match = re.search(r"([A-Z]{2,10})[-_](\d{2,5})", norm, re.I) if match: - return match.group(1) + '-' + match.group(2) + return match.group(1) + "-" + match.group(2) # 普通番号,运行到这里时表明无法匹配到带分隔符的番号 # 先尝试匹配东热的red, sky, ex三个不带-分隔符的系列 # (这三个系列已停止更新,因此根据其作品编号将数字范围限制得小一些以降低误匹配概率) - match = re.search(r'(RED[01]\d\d|SKY[0-3]\d\d|EX00[01]\d)', norm, re.I) + match = re.search(r"(RED[01]\d\d|SKY[0-3]\d\d|EX00[01]\d)", norm, re.I) if match: return match.group(1) # 然后再将影片视作缺失了-分隔符来匹配 - match = re.search(r'([A-Z]{2,})(\d{2,5})', norm, re.I) + match = re.search(r"([A-Z]{2,})(\d{2,5})", norm, re.I) if match: - return match.group(1) + '-' + match.group(2) + return match.group(1) + "-" + match.group(2) # 尝试匹配TMA制作的影片(如'T28-557',他家的番号很乱) - match = re.search(r'(T[23]8[-_]\d{3})', norm) + match = re.search(r"(T[23]8[-_]\d{3})", norm) if match: return match.group(1) # 尝试匹配东热n, k系列 - match = re.search(r'(N\d{4}|K\d{4})', norm, re.I) + match = re.search(r"(N\d{4}|K\d{4})", norm, re.I) if match: return match.group(1) # 尝试匹配纯数字番号(无码影片) - match = re.search(r'(\d{6}[-_]\d{2,3})', norm) + match = re.search(r"(\d{6}[-_]\d{2,3})", norm) if match: return match.group(1) # 
如果还是匹配不了,尝试将')('替换为'-'后再试,少部分影片的番号是由')('分隔的 - if ')(' in norm: - avid = get_id(norm.replace(')(', '-')) + if ")(" in norm: + avid = get_id(norm.replace(")(", "-")) if avid: return avid # 如果最后仍然匹配不了番号,则尝试使用文件所在文件夹的名字去匹配 - - if filepath.parent.name != '': # haven't reach '.' or '/' + + if filepath.parent.name != "": # haven't reach '.' or '/' return get_id(filepath.parent.name) else: - return '' + return "" + + +CD_POSTFIX = re.compile(r"([-_]\w|cd\d)$") -CD_POSTFIX = re.compile(r'([-_]\w|cd\d)$') def get_cid(filepath: str) -> str: """尝试将给定的文件名匹配为CID(Content ID)""" basename = os.path.splitext(os.path.basename(filepath))[0] # 移除末尾可能带有的分段影片序号 - possible = CD_POSTFIX.sub('', basename) + possible = CD_POSTFIX.sub("", basename) # cid只由数字、小写字母和下划线组成 - match = re.match(r'^([a-z\d_]+)$', possible, re.A) + match = re.match(r"^([a-z\d_]+)$", possible, re.A) if match: possible = match.group(1) - if '_' not in possible: + if "_" not in possible: # 长度为7-14的cid就占了约99.01%. 最长的cid为24,但是长为20-24的比例不到十万分之五 - match = re.match(r'^[a-z\d]{7,19}$', possible) + match = re.match(r"^[a-z\d]{7,19}$", possible) if match: return possible else: # 绝大多数都只有一个下划线(只有约万分之一带有两个下划线) - match2 = re.match(r'''^h_\d{3,4}[a-z]{1,10}\d{2,5}[a-z\d]{0,8}$ # 约 99.17% + match2 = re.match( + r"""^h_\d{3,4}[a-z]{1,10}\d{2,5}[a-z\d]{0,8}$ # 约 99.17% |^\d{3}_\d{4,5}$ # 约 0.57% |^402[a-z]{3,6}\d*_[a-z]{3,8}\d{5,6}$ # 约 0.09% |^h_\d{3,4}wvr\d\w\d{4,5}[a-z\d]{0,8}$ # 约 0.06% - $''', possible, re.VERBOSE) + $""", + possible, + re.VERBOSE, + ) if match2: return possible - return '' + return "" def guess_av_type(avid: str) -> str: """识别给定的番号所属的分类: normal, fc2, cid""" - match = re.match(r'^FC2-\d{5,7}$', avid, re.I) + match = re.match(r"^FC2-\d{5,7}$", avid, re.I) if match: - return 'fc2' - match = re.match(r'^GETCHU-(\d+)',avid,re.I) + return "fc2" + match = re.match(r"^GETCHU-(\d+)", avid, re.I) if match: - return 'getchu' - match = re.match(r'^GYUTTO-(\d+)',avid,re.I) + return "getchu" + match = re.match(r"^GYUTTO-(\d+)", avid, re.I) if match: - return 'gyutto' + return "gyutto" # 如果传入的avid完全匹配cid的模式,则将影片归类为cid cid = get_cid(avid) if cid == avid: - return 'cid' + return "cid" # 以上都不是: 默认归类为normal - return 'normal' + return "normal" if __name__ == "__main__": - print(get_id('FC2-123456/Unknown.mp4')) + print(get_id("FC2-123456/Unknown.mp4")) diff --git a/javsp/chromium.py b/javsp/chromium.py index db315293e..b7d5a5ef4 100644 --- a/javsp/chromium.py +++ b/javsp/chromium.py @@ -1,4 +1,5 @@ """解析Chromium系浏览器Cookies的相关函数""" + import os import sys import json @@ -9,7 +10,7 @@ from shutil import copyfile from datetime import datetime -__all__ = ['get_browsers_cookies'] +__all__ = ["get_browsers_cookies"] from cryptography.hazmat.primitives.ciphers.aead import AESGCM @@ -18,15 +19,16 @@ logger = logging.getLogger(__name__) -class Decrypter(): +class Decrypter: def __init__(self, key): self.key = key + def decrypt(self, encrypted_value): - nonce = encrypted_value[3:3+12] - ciphertext = encrypted_value[3+12:-16] + nonce = encrypted_value[3 : 3 + 12] + ciphertext = encrypted_value[3 + 12 : -16] tag = encrypted_value[-16:] cipher = AES.new(self.key, AES.MODE_GCM, nonce=nonce) - plaintext = cipher.decrypt_and_verify(ciphertext, tag).decode('utf-8') + plaintext = cipher.decrypt_and_verify(ciphertext, tag).decode("utf-8") return plaintext @@ -34,32 +36,38 @@ def get_browsers_cookies(): """获取系统上的所有Chromium系浏览器的JavDB的Cookies""" # 不予支持: Opera, 360安全&极速, 搜狗使用非标的用户目录或数据格式; QQ浏览器屏蔽站点 user_data_dirs = { - 'Chrome': '/Google/Chrome/User Data', - 'Chrome Beta': 
'/Google/Chrome Beta/User Data', - 'Chrome Canary': '/Google/Chrome SxS/User Data', - 'Chromium': '/Google/Chromium/User Data', - 'Edge': '/Microsoft/Edge/User Data', - 'Vivaldi': '/Vivaldi/User Data' + "Chrome": "/Google/Chrome/User Data", + "Chrome Beta": "/Google/Chrome Beta/User Data", + "Chrome Canary": "/Google/Chrome SxS/User Data", + "Chromium": "/Google/Chromium/User Data", + "Edge": "/Microsoft/Edge/User Data", + "Vivaldi": "/Vivaldi/User Data", } - LocalAppDataDir = os.getenv('LOCALAPPDATA') + LocalAppDataDir = os.getenv("LOCALAPPDATA") all_browser_cookies = [] exceptions = [] for brw, path in user_data_dirs.items(): user_dir = LocalAppDataDir + path - cookies_files = glob(user_dir+'/*/Cookies') + glob(user_dir+'/*/Network/Cookies') - local_state = user_dir+'/Local State' + cookies_files = glob(user_dir + "/*/Cookies") + glob( + user_dir + "/*/Network/Cookies" + ) + local_state = user_dir + "/Local State" if os.path.exists(local_state): key = decrypt_key(local_state) decrypter = Decrypter(key) for file in cookies_files: - profile = brw + ": " + file.split('User Data')[1].split(os.sep)[1] + profile = brw + ": " + file.split("User Data")[1].split(os.sep)[1] file = os.path.normpath(file) try: records = get_cookies(file, decrypter) if records: # 将records转换为便于使用的格式 for site, cookies in records.items(): - entry = {'profile': profile, 'site': site, 'cookies': cookies} + entry = { + "profile": profile, + "site": site, + "cookies": cookies, + } all_browser_cookies.append(entry) except Exception as e: exceptions.append(e) @@ -78,45 +86,51 @@ def convert_chrome_utc(chrome_utc): unix_utc = datetime.fromtimestamp(second) return unix_utc + def decrypt_key_win(local_state): """从Local State文件中提取并解密出Cookies文件的密钥""" # Chrome 80+ 的Cookies解密方法参考自: https://stackoverflow.com/a/60423699/6415337 import win32crypt - with open(local_state, 'rt', encoding='utf-8') as file: - encrypted_key = json.loads(file.read())['os_crypt']['encrypted_key'] - encrypted_key = base64.b64decode(encrypted_key) # Base64 decoding - encrypted_key = encrypted_key[5:] # Remove DPAPI - decrypted_key = win32crypt.CryptUnprotectData(encrypted_key, None, None, None, 0)[1] # Decrypt key + + with open(local_state, "rt", encoding="utf-8") as file: + encrypted_key = json.loads(file.read())["os_crypt"]["encrypted_key"] + encrypted_key = base64.b64decode(encrypted_key) # Base64 decoding + encrypted_key = encrypted_key[5:] # Remove DPAPI + decrypted_key = win32crypt.CryptUnprotectData(encrypted_key, None, None, None, 0)[ + 1 + ] # Decrypt key return decrypted_key def decrypt_key_linux(local_state): """从Local State文件中提取并解密出Cookies文件的密钥,适用于Linux""" # 读取Local State文件中的密钥 - with open(local_state, 'rt', encoding='utf-8') as file: - encrypted_key = json.loads(file.read())['os_crypt']['encrypted_key'] + with open(local_state, "rt", encoding="utf-8") as file: + encrypted_key = json.loads(file.read())["os_crypt"]["encrypted_key"] encrypted_key = base64.b64decode(encrypted_key) encrypted_key = encrypted_key[5:] key = encrypted_key - nonce = b' ' * 12 + nonce = b" " * 12 aesgcm = AESGCM(key) decrypted_key = aesgcm.decrypt(nonce, encrypted_key, None) return decrypted_key -decrypt_key = decrypt_key_win if sys.platform == 'win32' else decrypt_key_linux +decrypt_key = decrypt_key_win if sys.platform == "win32" else decrypt_key_linux -def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'): +def get_cookies(cookies_file, decrypter, host_pattern="javdb%.com"): """从cookies_file文件中查找指定站点的所有Cookies""" # 复制Cookies文件到临时目录,避免直接操作原始的Cookies文件 - 
temp_dir = os.getenv('TMPDIR', os.getenv('TEMP', os.getenv('TMP', '.'))) - temp_cookie = os.path.join(temp_dir, 'Cookies') + temp_dir = os.getenv("TMPDIR", os.getenv("TEMP", os.getenv("TMP", "."))) + temp_cookie = os.path.join(temp_dir, "Cookies") copyfile(cookies_file, temp_cookie) # 连接数据库进行查询 conn = sqlite3.connect(temp_cookie) cursor = conn.cursor() - cursor.execute(f'SELECT host_key, name, encrypted_value, expires_utc FROM cookies WHERE host_key LIKE "{host_pattern}"') + cursor.execute( + f'SELECT host_key, name, encrypted_value, expires_utc FROM cookies WHERE host_key LIKE "{host_pattern}"' + ) # 将查询结果按照host_key进行组织 now = datetime.now() records = {} @@ -127,7 +141,7 @@ def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'): if expires > now: d[name] = decrypter.decrypt(encrypted_value) # Cookies的核心字段是'_jdb_session',因此如果records中缺失此字段(说明已过期),则对应的Cookies不再有效 - valid_records = {k: v for k, v in records.items() if '_jdb_session' in v} + valid_records = {k: v for k, v in records.items() if "_jdb_session" in v} conn.close() os.remove(temp_cookie) return valid_records @@ -136,5 +150,4 @@ def get_cookies(cookies_file, decrypter, host_pattern='javdb%.com'): if __name__ == "__main__": all_cookies = get_browsers_cookies() for d in all_cookies: - print('{:<20}{}'.format(d['profile'], d['site'])) - + print("{:<20}{}".format(d["profile"], d["site"])) diff --git a/javsp/config.py b/javsp/config.py index 3fbc8f071..a3884b394 100644 --- a/javsp/config.py +++ b/javsp/config.py @@ -9,6 +9,7 @@ from javsp.lib import resource_path + class Scanner(BaseConfig): ignored_id_pattern: List[str] input_directory: Path | None = None @@ -16,26 +17,28 @@ class Scanner(BaseConfig): ignored_folder_name_pattern: List[str] minimum_size: ByteSize + class CrawlerID(str, Enum): - airav = 'airav' - avsox = 'avsox' - avwiki = 'avwiki' - dl_getchu = 'dl_getchu' - fanza = 'fanza' - fc2 = 'fc2' - fc2fan = 'fc2fan' - fc2ppvdb = 'fc2ppvdb' - gyutto = 'gyutto' - jav321 = 'jav321' - javbus = 'javbus' - javdb = 'javdb' - javlib = 'javlib' - javmenu = 'javmenu' - mgstage = 'mgstage' - njav = 'njav' - prestige = 'prestige' - arzon = 'arzon' - arzon_iv = 'arzon_iv' + airav = "airav" + avsox = "avsox" + avwiki = "avwiki" + dl_getchu = "dl_getchu" + fanza = "fanza" + fc2 = "fc2" + fc2fan = "fc2fan" + fc2ppvdb = "fc2ppvdb" + gyutto = "gyutto" + jav321 = "jav321" + javbus = "javbus" + javdb = "javdb" + javlib = "javlib" + javmenu = "javmenu" + mgstage = "mgstage" + njav = "njav" + prestige = "prestige" + arzon = "arzon" + arzon_iv = "arzon_iv" + class Network(BaseConfig): proxy_server: Url | None @@ -43,27 +46,28 @@ class Network(BaseConfig): timeout: Duration proxy_free: Dict[CrawlerID, Url] + class CrawlerSelect(BaseConfig): def items(self) -> List[tuple[str, list[CrawlerID]]]: return [ - ('normal', self.normal), - ('fc2', self.fc2), - ('cid', self.cid), - ('getchu', self.getchu), - ('gyutto', self.gyutto), + ("normal", self.normal), + ("fc2", self.fc2), + ("cid", self.cid), + ("getchu", self.getchu), + ("gyutto", self.gyutto), ] def __getitem__(self, index) -> list[CrawlerID]: match index: - case 'normal': + case "normal": return self.normal - case 'fc2': + case "fc2": return self.fc2 - case 'cid': + case "cid": return self.cid - case 'getchu': + case "getchu": return self.getchu - case 'gyutto': + case "gyutto": return self.gyutto raise Exception("Unknown crawler type") @@ -73,37 +77,40 @@ def __getitem__(self, index) -> list[CrawlerID]: getchu: list[CrawlerID] gyutto: list[CrawlerID] + class MovieInfoField(str, Enum): - 
dvdid = 'dvdid' - cid = 'cid' - url = 'url' - plot = 'plot' - cover = 'cover' - big_cover = 'big_cover' - genre = 'genre' - genre_id = 'genre_id' - genre_norm = 'genre_norm' - score = 'score' - title = 'title' - ori_title = 'ori_title' - magnet = 'magnet' - serial = 'serial' - actress = 'actress' - actress_pics = 'actress_pics' - director = 'director' - duration = 'duration' - producer = 'producer' - publisher = 'publisher' - uncensored = 'uncensored' - publish_date = 'publish_date' - preview_pics = 'preview_pics' - preview_video = 'preview_video' + dvdid = "dvdid" + cid = "cid" + url = "url" + plot = "plot" + cover = "cover" + big_cover = "big_cover" + genre = "genre" + genre_id = "genre_id" + genre_norm = "genre_norm" + score = "score" + title = "title" + ori_title = "ori_title" + magnet = "magnet" + serial = "serial" + actress = "actress" + actress_pics = "actress_pics" + director = "director" + duration = "duration" + producer = "producer" + publisher = "publisher" + uncensored = "uncensored" + publish_date = "publish_date" + preview_pics = "preview_pics" + preview_video = "preview_video" + class UseJavDBCover(str, Enum): yes = "yes" no = "no" fallback = "fallback" + class Crawler(BaseConfig): selection: CrawlerSelect required_keys: list[MovieInfoField] @@ -114,6 +121,7 @@ class Crawler(BaseConfig): use_javdb_cover: UseJavDBCover normalize_actress_name: bool + class MovieDefault(BaseConfig): title: str actress: str @@ -122,6 +130,7 @@ class MovieDefault(BaseConfig): producer: str publisher: str + class PathSummarize(BaseConfig): output_folder_pattern: str basename_pattern: str @@ -130,25 +139,31 @@ class PathSummarize(BaseConfig): max_actress_count: PositiveInt = 10 hard_link: bool + class TitleSummarize(BaseConfig): remove_trailing_actor_name: bool + class NFOSummarize(BaseConfig): basename_pattern: str title_pattern: str custom_genres_fields: list[str] custom_tags_fields: list[str] + class ExtraFanartSummarize(BaseConfig): enabled: bool scrap_interval: Duration + class SlimefaceEngine(BaseConfig): - name: Literal['slimeface'] + name: Literal["slimeface"] + class CoverCrop(BaseConfig): - engine: SlimefaceEngine | None - on_id_pattern: list[str] + engine: SlimefaceEngine | None + on_id_pattern: list[str] + class CoverSummarize(BaseConfig): basename_pattern: str @@ -156,9 +171,11 @@ class CoverSummarize(BaseConfig): add_label: bool crop: CoverCrop + class FanartSummarize(BaseConfig): basename_pattern: str + class Summarizer(BaseConfig): default: MovieDefault censor_options_representation: list[str] @@ -170,60 +187,76 @@ class Summarizer(BaseConfig): fanart: FanartSummarize extra_fanarts: ExtraFanartSummarize + class BaiduTranslateEngine(BaseConfig): - name: Literal['baidu'] + name: Literal["baidu"] app_id: str api_key: str + class BingTranslateEngine(BaseConfig): - name: Literal['bing'] + name: Literal["bing"] api_key: str + class ClaudeTranslateEngine(BaseConfig): - name: Literal['claude'] + name: Literal["claude"] api_key: str + class OpenAITranslateEngine(BaseConfig): - name: Literal['openai'] + name: Literal["openai"] url: Url api_key: str model: str + class GoogleTranslateEngine(BaseConfig): - name: Literal['google'] + name: Literal["google"] + TranslateEngine: TypeAlias = Union[ - BaiduTranslateEngine, - BingTranslateEngine, - ClaudeTranslateEngine, - OpenAITranslateEngine, - GoogleTranslateEngine, - None] + BaiduTranslateEngine, + BingTranslateEngine, + ClaudeTranslateEngine, + OpenAITranslateEngine, + GoogleTranslateEngine, + None, +] + class TranslateField(BaseConfig): title: 
bool plot: bool + class Translator(BaseConfig): - engine: TranslateEngine = Field(..., discriminator='name') + engine: TranslateEngine = Field(..., discriminator="name") fields: TranslateField + class Other(BaseConfig): check_update: bool auto_update: bool + def get_config_source(): - parser = ArgumentParser(prog='JavSP', description='汇总多站点数据的AV元数据刮削器', formatter_class=RawTextHelpFormatter) - parser.add_argument('-c', '--config', help='使用指定的配置文件') + parser = ArgumentParser( + prog="JavSP", + description="汇总多站点数据的AV元数据刮削器", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("-c", "--config", help="使用指定的配置文件") args, _ = parser.parse_known_args() sources = [] if args.config is None: - args.config = resource_path('config.yml') + args.config = resource_path("config.yml") sources.append(FileSource(file=args.config)) - sources.append(EnvSource(prefix='JAVSP_', allow_all=True)) - sources.append(CLArgSource(prefix='o')) + sources.append(EnvSource(prefix="JAVSP_", allow_all=True)) + sources.append(CLArgSource(prefix="o")) return sources + class Cfg(BaseConfig): scanner: Scanner network: Network @@ -231,4 +264,4 @@ class Cfg(BaseConfig): summarizer: Summarizer translator: Translator other: Other - CONFIG_SOURCES=get_config_source() + CONFIG_SOURCES = get_config_source() diff --git a/javsp/cropper/__init__.py b/javsp/cropper/__init__.py index e9c340873..381642289 100644 --- a/javsp/cropper/__init__.py +++ b/javsp/cropper/__init__.py @@ -2,8 +2,9 @@ from javsp.cropper.interface import Cropper, DefaultCropper from javsp.cropper.slimeface_crop import SlimefaceCropper + def get_cropper(engine: SlimefaceEngine | None) -> Cropper: if engine is None: return DefaultCropper() - if engine.name == 'slimeface': + if engine.name == "slimeface": return SlimefaceCropper() diff --git a/javsp/cropper/interface.py b/javsp/cropper/interface.py index 710c2b630..698db72aa 100644 --- a/javsp/cropper/interface.py +++ b/javsp/cropper/interface.py @@ -1,23 +1,27 @@ from PIL.Image import Image from abc import ABC, abstractmethod + + class Cropper(ABC): @abstractmethod def crop_specific(self, fanart: Image, ratio: float) -> Image: pass def crop(self, fanart: Image, ratio: float | None = None) -> Image: - if ratio is None: + if ratio is None: ratio = 1.42 return self.crop_specific(fanart, ratio) + class DefaultCropper(Cropper): def crop_specific(self, fanart: Image, ratio: float) -> Image: """将给定的fanart图片文件裁剪为适合poster尺寸的图片""" (fanart_w, fanart_h) = fanart.size - (poster_w, poster_h) = \ - (int(fanart_h / ratio), fanart_h) \ - if fanart_h / fanart_w < ratio \ - else (fanart_w, int(fanart_w * ratio)) # 图片太“瘦”时以宽度来定裁剪高度 + (poster_w, poster_h) = ( + (int(fanart_h / ratio), fanart_h) + if fanart_h / fanart_w < ratio + else (fanart_w, int(fanart_w * ratio)) + ) # 图片太“瘦”时以宽度来定裁剪高度 box = (poster_w - fanart_w, 0, poster_w, poster_h) fanart.crop(box) diff --git a/javsp/cropper/slimeface_crop.py b/javsp/cropper/slimeface_crop.py index a0f9712e1..610b4f8fb 100644 --- a/javsp/cropper/slimeface_crop.py +++ b/javsp/cropper/slimeface_crop.py @@ -2,33 +2,39 @@ from javsp.cropper.interface import Cropper, DefaultCropper from javsp.cropper.utils import get_bound_box_by_face + class SlimefaceCropper(Cropper): def crop_specific(self, fanart: Image.Image, ratio: float) -> Image.Image: - try: - # defer the libary import so we don't break if missing dependencies + try: + # defer the libary import so we don't break if missing dependencies from slimeface import detectRGB - bbox_confs = detectRGB(fanart.width, fanart.height, 
fanart.convert('RGB').tobytes()) - bbox_confs.sort(key=lambda conf_bbox: -conf_bbox[4]) # last arg stores confidence + + bbox_confs = detectRGB( + fanart.width, fanart.height, fanart.convert("RGB").tobytes() + ) + bbox_confs.sort( + key=lambda conf_bbox: -conf_bbox[4] + ) # last arg stores confidence face = bbox_confs[0][:-1] poster_box = get_bound_box_by_face(face, fanart.size, ratio) return fanart.crop(poster_box) except: return DefaultCropper().crop_specific(fanart, ratio) -if __name__ == '__main__': + +if __name__ == "__main__": from argparse import ArgumentParser - arg_parser = ArgumentParser(prog='slimeface crop') + arg_parser = ArgumentParser(prog="slimeface crop") - arg_parser.add_argument('-i', '--image', help='path to image to detect') + arg_parser.add_argument("-i", "--image", help="path to image to detect") args, _ = arg_parser.parse_known_args() - if(args.image is None): + if args.image is None: print("USAGE: slimeface_crop.py -i/--image [path]") exit(1) input = Image.open(args.image) im = SlimefaceCropper().crop(input) - im.save('output.png') - + im.save("output.png") diff --git a/javsp/cropper/utils.py b/javsp/cropper/utils.py index b11b48eee..7f68ac57a 100644 --- a/javsp/cropper/utils.py +++ b/javsp/cropper/utils.py @@ -1,12 +1,16 @@ def get_poster_size(image_shape: tuple[int, int], ratio: float) -> tuple[int, int]: - (fanart_w, fanart_h) = image_shape - (poster_w, poster_h) = \ - (int(fanart_h / ratio), fanart_h) \ - if fanart_h / fanart_w < ratio \ - else (fanart_w, int(fanart_w * ratio)) # 图片太“瘦”时以宽度来定裁剪高度 - return (poster_w, poster_h) + (fanart_w, fanart_h) = image_shape + (poster_w, poster_h) = ( + (int(fanart_h / ratio), fanart_h) + if fanart_h / fanart_w < ratio + else (fanart_w, int(fanart_w * ratio)) + ) # 图片太“瘦”时以宽度来定裁剪高度 + return (poster_w, poster_h) + -def get_bound_box_by_face(face: tuple[int, int, int, int], image_shape: tuple[int, int], ratio: float) -> tuple[int, int, int, int]: +def get_bound_box_by_face( + face: tuple[int, int, int, int], image_shape: tuple[int, int], ratio: float +) -> tuple[int, int, int, int]: """ returns (left, upper, right, lower) """ @@ -24,4 +28,3 @@ def get_bound_box_by_face(face: tuple[int, int, int, int], image_shape: tuple[in poster_left = min(poster_left, fanart_w - poster_w) poster_left = int(poster_left) return (poster_left, 0, poster_left + poster_w, poster_h) - diff --git a/javsp/datatype.py b/javsp/datatype.py index 4bfdd7171..67cc21666 100644 --- a/javsp/datatype.py +++ b/javsp/datatype.py @@ -1,4 +1,5 @@ """定义数据类型和一些通用性的对数据类型的操作""" + import os import csv import json @@ -11,7 +12,8 @@ logger = logging.getLogger(__name__) -filemove_logger = logging.getLogger('filemove') +filemove_logger = logging.getLogger("filemove") + class MovieInfo: def __init__(self, dvdid: str = None, /, *, cid: str = None, from_file=None): @@ -22,36 +24,38 @@ def __init__(self, dvdid: str = None, /, *, cid: str = None, from_file=None): """ arg_count = len([i for i in [dvdid, cid, from_file] if i]) if arg_count != 1: - raise TypeError(f'Require 1 parameter but {arg_count} given') + raise TypeError(f"Require 1 parameter but {arg_count} given") if isinstance(dvdid, Movie): self.dvdid = dvdid.dvdid self.cid = dvdid.cid else: - self.dvdid = dvdid # DVD ID,即通常的番号 - self.cid = cid # DMM Content ID + self.dvdid = dvdid # DVD ID,即通常的番号 + self.cid = cid # DMM Content ID # 创建类的默认属性 - self.url = None # 影片页面的URL - self.plot = None # 故事情节 - self.cover = None # 封面图片(URL) - self.big_cover = None # 高清封面图片(URL) - self.genre = None # 影片分类的标签 - self.genre_id = None # 
影片分类的标签的ID,用于解决部分站点多个genre同名的问题,也便于管理多语言的genre - self.genre_norm = None # 统一后的影片分类的标签 - self.score = None # 评分(10分制,为方便提取写入和保持统一,应以字符串类型表示) - self.title = None # 影片标题(不含番号) - self.ori_title = None # 原始影片标题,仅在标题被处理过时才对此字段赋值 - self.magnet = None # 磁力链接 - self.serial = None # 系列 - self.actress = None # 出演女优 - self.actress_pics = None # 出演女优的头像。单列一个字段,便于满足不同的使用需要 - self.director = None # 导演 - self.duration = None # 影片时长 - self.producer = None # 制作商 - self.publisher = None # 发行商 - self.uncensored = None # 是否为无码影片 - self.publish_date = None # 发布日期 - self.preview_pics = None # 预览图片(URL) - self.preview_video = None # 预览视频(URL) + self.url = None # 影片页面的URL + self.plot = None # 故事情节 + self.cover = None # 封面图片(URL) + self.big_cover = None # 高清封面图片(URL) + self.genre = None # 影片分类的标签 + self.genre_id = None # 影片分类的标签的ID,用于解决部分站点多个genre同名的问题,也便于管理多语言的genre + self.genre_norm = None # 统一后的影片分类的标签 + self.score = ( + None # 评分(10分制,为方便提取写入和保持统一,应以字符串类型表示) + ) + self.title = None # 影片标题(不含番号) + self.ori_title = None # 原始影片标题,仅在标题被处理过时才对此字段赋值 + self.magnet = None # 磁力链接 + self.serial = None # 系列 + self.actress = None # 出演女优 + self.actress_pics = None # 出演女优的头像。单列一个字段,便于满足不同的使用需要 + self.director = None # 导演 + self.duration = None # 影片时长 + self.producer = None # 制作商 + self.publisher = None # 发行商 + self.uncensored = None # 是否为无码影片 + self.publish_date = None # 发布日期 + self.preview_pics = None # 预览图片(URL) + self.preview_video = None # 预览视频(URL) if from_file: if os.path.isfile(from_file): @@ -80,15 +84,15 @@ def dump(self, filepath=None, crawler=None) -> None: if not filepath: id = self.dvdid if self.dvdid else self.cid if crawler: - filepath = f'../unittest/data/{id} ({crawler}).json' + filepath = f"../unittest/data/{id} ({crawler}).json" filepath = os.path.join(os.path.dirname(__file__), filepath) else: - filepath = id + '.json' - with open(filepath, 'wt', encoding='utf-8') as f: + filepath = id + ".json" + with open(filepath, "wt", encoding="utf-8") as f: f.write(str(self)) def load(self, filepath) -> None: - with open(filepath, 'rt', encoding='utf-8') as f: + with open(filepath, "rt", encoding="utf-8") as f: d = json.load(f) # 更新对象属性 attrs = vars(self).keys() @@ -100,68 +104,75 @@ def get_info_dic(self): """生成用来填充模板的字典""" info = self d = {} - d['num'] = info.dvdid or info.cid - d['title'] = info.title or Cfg().summarizer.default.title - d['rawtitle'] = info.ori_title or d['title'] - d['actress'] = ','.join(info.actress) if info.actress else Cfg().summarizer.default.actress - d['score'] = info.score or '0' - d['censor'] = Cfg().summarizer.censor_options_representation[1 if info.uncensored else 0] - d['serial'] = info.serial or Cfg().summarizer.default.series - d['director'] = info.director or Cfg().summarizer.default.director - d['producer'] = info.producer or Cfg().summarizer.default.producer - d['publisher'] = info.publisher or Cfg().summarizer.default.publisher - d['date'] = info.publish_date or '0000-00-00' - d['year'] = d['date'].split('-')[0] + d["num"] = info.dvdid or info.cid + d["title"] = info.title or Cfg().summarizer.default.title + d["rawtitle"] = info.ori_title or d["title"] + d["actress"] = ( + ",".join(info.actress) if info.actress else Cfg().summarizer.default.actress + ) + d["score"] = info.score or "0" + d["censor"] = Cfg().summarizer.censor_options_representation[ + 1 if info.uncensored else 0 + ] + d["serial"] = info.serial or Cfg().summarizer.default.series + d["director"] = info.director or Cfg().summarizer.default.director + d["producer"] = info.producer or Cfg().summarizer.default.producer 
+ d["publisher"] = info.publisher or Cfg().summarizer.default.publisher + d["date"] = info.publish_date or "0000-00-00" + d["year"] = d["date"].split("-")[0] # cid中不会出现'-',可以直接从d['num']拆分出label - num_items = d['num'].split('-') - d['label'] = num_items[0] if len(num_items) > 1 else '---' - d['genre'] = ','.join(info.genre_norm if info.genre_norm else info.genre if info.genre else []) + num_items = d["num"].split("-") + d["label"] = num_items[0] if len(num_items) > 1 else "---" + d["genre"] = ",".join( + info.genre_norm if info.genre_norm else info.genre if info.genre else [] + ) return d class Movie: """用于关联影片文件的类""" + def __init__(self, dvdid=None, /, *, cid=None) -> None: arg_count = len([i for i in (dvdid, cid) if i]) if arg_count != 1: - raise TypeError(f'Require 1 parameter but {arg_count} given') + raise TypeError(f"Require 1 parameter but {arg_count} given") # 创建类的默认属性 - self.dvdid = dvdid # DVD ID,即通常的番号 - self.cid = cid # DMM Content ID - self.files = [] # 关联到此番号的所有影片文件的列表(用于管理带有多个分片的影片) - self.data_src = 'normal' # 数据源:不同的数据源将使用不同的爬虫 - self.info: MovieInfo = None # 抓取到的影片信息 - self.save_dir = None # 存放影片、封面、NFO的文件夹路径 - self.basename = None # 按照命名模板生成的不包含路径和扩展名的basename - self.nfo_file = None # nfo文件的路径 - self.fanart_file = None # fanart文件的路径 - self.poster_file = None # poster文件的路径 - self.guid = None # GUI使用的唯一标识,通过dvdid和files做md5生成 + self.dvdid = dvdid # DVD ID,即通常的番号 + self.cid = cid # DMM Content ID + self.files = [] # 关联到此番号的所有影片文件的列表(用于管理带有多个分片的影片) + self.data_src = "normal" # 数据源:不同的数据源将使用不同的爬虫 + self.info: MovieInfo = None # 抓取到的影片信息 + self.save_dir = None # 存放影片、封面、NFO的文件夹路径 + self.basename = None # 按照命名模板生成的不包含路径和扩展名的basename + self.nfo_file = None # nfo文件的路径 + self.fanart_file = None # fanart文件的路径 + self.poster_file = None # poster文件的路径 + self.guid = None # GUI使用的唯一标识,通过dvdid和files做md5生成 @cached_property def hard_sub(self) -> bool: """影片文件带有内嵌字幕""" - return 'C' in self.attr_str + return "C" in self.attr_str @cached_property def uncensored(self) -> bool: """影片文件是无码流出/无码破解版本(很多种子并不严格区分这两种,故这里也不进一步细分)""" - return 'U' in self.attr_str + return "U" in self.attr_str @cached_property def attr_str(self) -> str: """用来标示影片文件的额外属性的字符串(空字符串/-U/-C/-UC)""" # 暂不支持多分片的影片 if len(self.files) != 1: - return '' + return "" r = detect_special_attr(self.files[0], self.dvdid) if r: - r = '-' + r + r = "-" + r return r def __repr__(self) -> str: - if self.cid and self.data_src == 'cid': + if self.cid and self.data_src == "cid": expression = f"('cid={self.cid}')" else: expression = f"('{self.dvdid}')" @@ -169,13 +180,14 @@ def __repr__(self) -> str: def rename_files(self, use_hardlink: bool = False) -> None: """根据命名规则移动(重命名)影片文件""" - def move_file(src:str, dst:str): + + def move_file(src: str, dst: str): """移动(重命名)文件并记录信息到日志""" abs_dst = os.path.abspath(dst) # shutil.move might overwrite dst file if os.path.exists(abs_dst): - raise FileExistsError(f'File exists: {abs_dst}') - if (use_hardlink): + raise FileExistsError(f"File exists: {abs_dst}") + if use_hardlink: os.link(src, abs_dst) else: shutil.move(src, abs_dst) @@ -183,7 +195,9 @@ def move_file(src:str, dst:str): dst_name = os.path.basename(dst) logger.info(f"重命名文件: '{src_rel}' -> '...{os.sep}{dst_name}'") # 目前StreamHandler并未设置filter,为了避免显示中出现重复的日志,这里暂时只能用debug级别 - filemove_logger.debug(f'移动(重命名)文件: \n 原路径: "{src}"\n 新路径: "{abs_dst}"') + filemove_logger.debug( + f'移动(重命名)文件: \n 原路径: "{src}"\n 新路径: "{abs_dst}"' + ) new_paths = [] dir = os.path.dirname(self.files[0]) @@ -196,28 +210,31 @@ def move_file(src:str, dst:str): else: for i, fullpath 
in enumerate(self.files, start=1):
ext = os.path.splitext(fullpath)[1]
- newpath = os.path.join(self.save_dir, self.basename + f'-CD{i}' + ext)
+ newpath = os.path.join(self.save_dir, self.basename + f"-CD{i}" + ext)
move_file(fullpath, newpath)
new_paths.append(newpath)
self.new_paths = new_paths
if len(os.listdir(dir)) == 0:
- #如果移动文件后目录为空则删除该目录
+ # 如果移动文件后目录为空则删除该目录
os.rmdir(dir)
class GenreMap(dict):
"""genre的映射表"""
+
def __init__(self, file):
genres = {}
- with open(resource_path(file), newline='', encoding='utf-8-sig') as csvfile:
+ with open(resource_path(file), newline="", encoding="utf-8-sig") as csvfile:
reader = csv.DictReader(csvfile)
try:
for row in reader:
- genres[row['id']] = row['translate']
+ genres[row["id"]] = row["translate"]
except UnicodeDecodeError:
- logger.error('CSV file must be saved as UTF-8-BOM to edit is in Excel')
+ logger.error("CSV file must be saved as UTF-8-BOM to edit it in Excel")
except KeyError:
- logger.error("The columns 'id' and 'translate' must exist in the csv file")
+ logger.error(
+ "The columns 'id' and 'translate' must exist in the csv file"
+ )
self.update(genres)
def map(self, ls):
diff --git a/javsp/file.py b/javsp/file.py
index 9ae6b0f8b..b986dc08d 100644
--- a/javsp/file.py
+++ b/javsp/file.py
@@ -1,16 +1,22 @@
"""与文件相关的各类功能"""
+
import os
-from pathlib import Path
import re
import ctypes
import logging
import itertools
-import json
from sys import platform
from typing import List
-__all__ = ['scan_movies', 'get_fmt_size', 'get_remaining_path_len', 'replace_illegal_chars', 'get_failed_when_scan', 'find_subtitle_in_dir']
+__all__ = [
+ "scan_movies",
+ "get_fmt_size",
+ "get_remaining_path_len",
+ "replace_illegal_chars",
+ "get_failed_when_scan",
+ "find_subtitle_in_dir",
+]
from javsp.avid import *
@@ -24,14 +30,16 @@ def scan_movies(root: str) -> List[Movie]:
"""获取文件夹内的所有影片的列表(自动探测同一文件夹内的分片)"""
- # 由于实现的限制: 
# 1. 以数字编号最多支持10个分片,字母编号最多支持26个分片
# 2. 允许分片间的编号有公共的前导符(如编号01, 02, 03),因为求prefix时前导符也会算进去
# 扫描所有影片文件并获取它们的番号
- dic = {} # avid: [abspath1, abspath2...]
+ dic = {}  # avid: [abspath1, abspath2...]
small_videos = {}
- ignore_folder_name_pattern = re.compile('|'.join(Cfg().scanner.ignored_folder_name_pattern))
+ ignore_folder_name_pattern = re.compile(
+ "|".join(Cfg().scanner.ignored_folder_name_pattern)
+ )
for dirpath, dirnames, filenames in os.walk(root):
for name in dirnames.copy():
if ignore_folder_name_pattern.match(name):
@@ -55,7 +63,7 @@ def scan_movies(root: str) -> List[Movie]:
else:
dic[avid] = [fullpath]
else:
- fail = Movie('无法识别番号')
+ fail = Movie("无法识别番号")
fail.files = [fullpath]
failed_items.append(fail)
logger.error(f"无法提取影片番号: '{fullpath}'")
@@ -70,15 +78,17 @@ def scan_movies(root: str) -> List[Movie]:
elif avid:
has_avid[name] = avid
# 对于前面忽略的视频生成一个简单的提示
- small_videos = {k:sorted(v) for k,v in sorted(small_videos.items())}
+ small_videos = {k: sorted(v) for k, v in sorted(small_videos.items())}
skipped_files = list(itertools.chain(*small_videos.values()))
skipped_cnt = len(skipped_files)
if skipped_cnt > 0:
if len(has_avid) > 0:
- logger.info(f"跳过了 {', '.join(has_avid)} 等{skipped_cnt}个小于指定大小的视频文件")
+ logger.info(
+ f"跳过了 {', '.join(has_avid)} 等{skipped_cnt}个小于指定大小的视频文件"
+ )
else:
logger.info(f"跳过了{skipped_cnt}个小于指定大小的视频文件")
- logger.debug('跳过的视频文件如下:\n' + '\n'.join(skipped_files))
+ logger.debug("跳过的视频文件如下:\n" + "\n".join(skipped_files))
# 检查是否有多部影片对应同一个番号
non_slice_dup = {} # avid: [abspath1, abspath2...]
for avid, files in dic.copy().items(): @@ -95,19 +105,21 @@ def scan_movies(root: str) -> List[Movie]: basenames = [os.path.basename(i) for i in files] prefix = os.path.commonprefix(basenames) try: - pattern_expr = re_escape(prefix) + r'\s*([a-z\d])\s*' + pattern_expr = re_escape(prefix) + r"\s*([a-z\d])\s*" pattern = re.compile(pattern_expr, flags=re.I) except re.error: logger.debug(f"正则识别影片分片信息时出错: '{pattern_expr}'") del dic[avid] continue - remaining = [pattern.sub(r'\1', i).lower() for i in basenames] + remaining = [pattern.sub(r"\1", i).lower() for i in basenames] postfixes = [i[1:] for i in remaining] slices = [i[0] for i in remaining] # 如果有不同的后缀,说明有文件名不符合正则表达式条件(没有发生替换或不带分片信息) - if (len(set(postfixes)) != 1 + if ( + len(set(postfixes)) != 1 # remaining为初步提取的分片信息,不允许有重复值 - or len(slices) != len(set(slices))): + or len(slices) != len(set(slices)) + ): logger.debug(f"无法识别分片信息: {prefix=}, {remaining=}") non_slice_dup[avid] = files del dic[avid] @@ -115,7 +127,9 @@ def scan_movies(root: str) -> List[Movie]: # 影片编号必须从 0/1/a 开始且编号连续 sorted_slices = sorted(slices) first, last = sorted_slices[0], sorted_slices[-1] - if (first not in ('0', '1', 'a')) or (ord(last) != (ord(first)+len(sorted_slices)-1)): + if (first not in ("0", "1", "a")) or ( + ord(last) != (ord(first) + len(sorted_slices) - 1) + ): logger.debug(f"无效的分片起始编号或分片编号不连续: {sorted_slices=}") non_slice_dup[avid] = files del dic[avid] @@ -125,18 +139,21 @@ def scan_movies(root: str) -> List[Movie]: dic[avid] = mapped_files # 汇总输出错误提示信息 - msg = '' + msg = "" for avid, files in non_slice_dup.items(): - msg += f'{avid}: \n' + msg += f"{avid}: \n" for f in files: - msg += (' ' + os.path.relpath(f, root) + '\n') + msg += " " + os.path.relpath(f, root) + "\n" if msg: - logger.error("下列番号对应多部影片文件且不符合分片规则,已略过整理,请手动处理后重新运行脚本: \n" + msg) + logger.error( + "下列番号对应多部影片文件且不符合分片规则,已略过整理,请手动处理后重新运行脚本: \n" + + msg + ) # 转换数据的组织格式 movies: List[Movie] = [] for avid, files in dic.items(): src = guess_av_type(avid) - if src != 'cid': + if src != "cid": mov = Movie(avid) else: mov = Movie(cid=avid) @@ -144,7 +161,7 @@ def scan_movies(root: str) -> List[Movie]: mov.dvdid = get_id(files[0]) mov.files = files mov.data_src = src - logger.debug(f'影片数据源类型: {avid}: {src}') + logger.debug(f"影片数据源类型: {avid}: {src}") movies.append(mov) return movies @@ -154,37 +171,41 @@ def get_failed_when_scan(): return failed_items -_PARDIR_REPLACE = re.compile(r'\.{2,}') +_PARDIR_REPLACE = re.compile(r"\.{2,}") + + def replace_illegal_chars(name): """将不能用于文件名的字符替换为形近的字符""" # 非法字符列表 https://stackoverflow.com/a/31976060/6415337 - if platform == 'win32': + if platform == "win32": # http://www.unicode.org/Public/security/latest/confusables.txt - charmap = {'<': '❮', - '>': '❯', - ':': ':', - '"': '″', - '/': '/', - '\\': '\', - '|': '|', - '?': '?', - '*': '꘎'} + charmap = { + "<": "❮", + ">": "❯", + ":": ":", + '"': "″", + "/": "/", + "\\": "\", + "|": "|", + "?": "?", + "*": "꘎", + } for c, rep in charmap.items(): name = name.replace(c, rep) elif platform == "darwin": # MAC OS X - name = name.replace(':', ':') - else: # 其余都当做Linux处理 - name = name.replace('/', '/') + name = name.replace(":", ":") + else: # 其余都当做Linux处理 + name = name.replace("/", "/") # 处理连续多个英文句点. 
if os.pardir in name: - name = _PARDIR_REPLACE.sub('…', name) + name = _PARDIR_REPLACE.sub("…", name) return name def is_remote_drive(path: str): """判断一个路径是否为远程映射到本地""" - #TODO: 当前仅支持Windows平台 - if platform != 'win32': + # TODO: 当前仅支持Windows平台 + if platform != "win32": return False DRIVE_REMOTE = 0x4 drive = os.path.splitdrive(os.path.abspath(path))[0] + os.sep @@ -194,10 +215,14 @@ def is_remote_drive(path: str): def get_remaining_path_len(path): """计算当前系统支持的最大路径长度与给定路径长度的差值""" - #TODO: 支持不同的操作系统 + # TODO: 支持不同的操作系统 fullpath = os.path.abspath(path) # Windows: If the length exceeds ~256 characters, you will be able to see the path/files via Windows/File Explorer, but may not be able to delete/move/rename these paths/files - length = len(fullpath.encode('utf-8')) if Cfg().summarizer.path.length_by_byte else len(fullpath) + length = ( + len(fullpath.encode("utf-8")) + if Cfg().summarizer.path.length_by_byte + else len(fullpath) + ) remaining = Cfg().summarizer.path.length_maximum - length return remaining @@ -215,7 +240,7 @@ def get_fmt_size(file_or_size) -> str: size = file_or_size else: size = os.path.getsize(file_or_size) - for unit in ['','Ki','Mi','Gi','Ti']: + for unit in ["", "Ki", "Mi", "Gi", "Ti"]: # 1023.995: to avoid rounding bug when format str, e.g. 1048571 -> 1024.0 KiB if abs(size) < 1023.995: return f"{size:3.2f} {unit}B" @@ -223,7 +248,9 @@ def get_fmt_size(file_or_size) -> str: _sub_files = {} -SUB_EXTENSIONS = ('.srt', '.ass') +SUB_EXTENSIONS = (".srt", ".ass") + + def find_subtitle_in_dir(folder: str, dvdid: str): """在folder内寻找是否有匹配dvdid的字幕""" folder_data = _sub_files.get(folder) diff --git a/javsp/func.py b/javsp/func.py index 042afea5c..ba2048acd 100644 --- a/javsp/func.py +++ b/javsp/func.py @@ -1,4 +1,5 @@ """业务逻辑所需的或有一定通用性的函数""" + # 为了降低耦合度,也避免功能复杂后可能出现的循环导入的问题,这里尽量不导入项目内部的模块 # 如果需要获得配置信息,也应当由外部模块将配置项的值以参数的形式传入 import os @@ -27,24 +28,31 @@ from javsp.lib import re_escape, resource_path -__all__ = ['select_folder', 'get_scan_dir', 'remove_trail_actor_in_title', - 'shutdown', 'CLEAR_LINE', 'check_update', 'split_by_punc'] +__all__ = [ + "select_folder", + "get_scan_dir", + "remove_trail_actor_in_title", + "shutdown", + "CLEAR_LINE", + "check_update", + "split_by_punc", +] -CLEAR_LINE = '\r\x1b[K' +CLEAR_LINE = "\r\x1b[K" logger = logging.getLogger(__name__) -def select_folder(default_dir=''): +def select_folder(default_dir=""): """使用文件对话框提示用户选择一个文件夹""" if not USE_GUI: logger.error("无法打开窗口,请通过命令行的方式输入扫描路径") exit(1) window = Tk() window.withdraw() - window.iconbitmap(resource_path('image/JavSP.ico')) + window.iconbitmap(resource_path("image/JavSP.ico")) path = filedialog.askdirectory(initialdir=default_dir) - if path != '': + if path != "": return os.path.normpath(path) @@ -57,21 +65,21 @@ def get_scan_dir(cfg_scan_dir: Path | None) -> str | None: else: logger.error(f"配置的待整理文件夹无效:'{cfg_scan_dir}'") else: - if platform.system().lower() == 'windows': - print('请选择要整理的文件夹:', end='') + if platform.system().lower() == "windows": + print("请选择要整理的文件夹:", end="") root = select_folder() else: - root = input('请选择要整理的文件夹路径,必须是绝对路径: ') + root = input("请选择要整理的文件夹路径,必须是绝对路径: ") print(root) return root -def remove_trail_actor_in_title(title:str, actors:list) -> str: +def remove_trail_actor_in_title(title: str, actors: list) -> str: """寻找并移除标题尾部的女优名""" if not (actors and title): return title # 目前使用分隔符白名单来做检测(担心按Unicode范围匹配误伤太多),考虑尽可能多的分隔符 - delimiters = '-xX &·,; &・,;' + delimiters = "-xX &·,; &・,;" actor_ls = [re_escape(i) for i in actors if i] pattern = 
f"^(.*?)([{delimiters}]{{1,3}}({'|'.join(actor_ls)}))+$" # 使用match而不是sub是为了将替换掉的部分写入日志 @@ -87,11 +95,13 @@ def shutdown(timeout=120): """关闭计算机""" try: for i in reversed(range(timeout)): - print(CLEAR_LINE + f"JavSP整理完成,将在 {i} 秒后关机。按'Ctrl+C'取消", end='') + print( + CLEAR_LINE + f"JavSP整理完成,将在 {i} 秒后关机。按'Ctrl+C'取消", end="" + ) time.sleep(1) - logger.info('整理完成,自动关机') - #TODO: 当前仅支持Windows平台 - os.system('shutdown -s') + logger.info("整理完成,自动关机") + # TODO: 当前仅支持Windows平台 + os.system("shutdown -s") except KeyboardInterrupt: return @@ -101,7 +111,7 @@ def utc2local(utc_str): # python不支持 ISO-8601 中的Z后缀 now = time.time() offset = datetime.fromtimestamp(now) - datetime.utcfromtimestamp(now) - utc_str = utc_str.replace('Z', '+00:00') + utc_str = utc_str.replace("Z", "+00:00") utc_time = datetime.fromisoformat(utc_str) local_time = utc_time + offset return local_time @@ -111,7 +121,7 @@ def get_actual_width(mix_str: str) -> int: """给定一个中英混合的字符串,返回实际的显示宽度""" width = len(mix_str) for c in mix_str: - if u'\u4e00' <= c <= u'\u9fa5': + if "\u4e00" <= c <= "\u9fa5": width += 1 return width @@ -120,31 +130,33 @@ def align_center(mix_str: str, total_width: int) -> str: """给定一个中英混合的字符串,根据其实际显示宽度中心对齐""" actual_width = get_actual_width(mix_str) add_space = int((total_width - actual_width) / 2) - aligned_str = ' ' * add_space + mix_str + aligned_str = " " * add_space + mix_str return aligned_str # 枚举Unicode各平面内中日韩区块及拉丁字母区块内的所有标点符号 _punc = ( -"  ", # spaces -"!\"#%&'()*,-./:;?@[\\]_{}", # (0x0, 0x7f), Basic Latin -"¡§«¶·»¿", # (0x80, 0xff), Latin-1 Supplement -";·", # (0x370, 0x3ff), Greek and Coptic -"‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞", # (0x2000, 0x206f), General Punctuation -"、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽", # (0x3000, 0x303f), CJK Symbols and Punctuation -"゠・", # (0x30a0, 0x30ff), Katakana -"︐︑︒︓︔︕︖︗︘︙", # (0xfe10, 0xfe1f), Vertical Forms -"︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏", # (0xfe30, 0xfe4f), CJK Compatibility Forms -# "﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫", # (0xfe50, 0xfe6f), Small Form Variants -"!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・", # (0xff00, 0xffef), Halfwidth and Fullwidth Forms + "  ", # spaces + "!\"#%&'()*,-./:;?@[\\]_{}", # (0x0, 0x7f), Basic Latin + "¡§«¶·»¿", # (0x80, 0xff), Latin-1 Supplement + ";·", # (0x370, 0x3ff), Greek and Coptic + "‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞", # (0x2000, 0x206f), General Punctuation + "、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽", # (0x3000, 0x303f), CJK Symbols and Punctuation + "゠・", # (0x30a0, 0x30ff), Katakana + "︐︑︒︓︔︕︖︗︘︙", # (0xfe10, 0xfe1f), Vertical Forms + "︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏", # (0xfe30, 0xfe4f), CJK Compatibility Forms + # "﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫", # (0xfe50, 0xfe6f), Small Form Variants + "!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・", # (0xff00, 0xffef), Halfwidth and Fullwidth Forms ) -_punc_pattern = re.compile('.*?[' + ''.join(_punc) + ']') +_punc_pattern = re.compile(".*?[" + "".join(_punc) + "]") + + def split_by_punc(s): """将一个字符串按照Unicode标准中的标点符号进行分割""" iters = list(_punc_pattern.finditer(s)) if iters: - ls = [s[i.span()[0]: i.span()[1]] for i in iters] - ls.append(s[iters[-1].span()[1]:]) + ls = [s[i.span()[0] : i.span()[1]] for i in iters] + ls.append(s[iters[-1].span()[1] :]) else: ls = [s] return ls @@ -161,67 +173,71 @@ def print_header(title, info=[]): info_width = 0 terminal_width = shutil.get_terminal_size().columns display_width = min(max(title_width, info_width) + 6, terminal_width) - print('=' * display_width) + print("=" * display_width) for line in title: print(align_center(line, display_width)) if 
info: - print('-' * display_width) + print("-" * display_width) for line in info: print(line) - print('=' * display_width) - print('') + print("=" * display_width) + print("") # 使用pyinstaller打包exe时生成hook,运行时由该hook将版本信息注入到sys中 - local_version = meta.version('javsp') + local_version = meta.version("javsp") if local_version == "": return # 检查更新 if allow_check: - api_url = 'https://api.github.com/repos/Yuukiy/JavSP/releases/latest' - release_url = 'https://github.com/Yuukiy/JavSP/releases/latest' - print('正在检查更新...', end='') + api_url = "https://api.github.com/repos/Yuukiy/JavSP/releases/latest" + release_url = "https://github.com/Yuukiy/JavSP/releases/latest" + print("正在检查更新...", end="") try: data = request_get(api_url, timeout=3).json() - latest_version = data['tag_name'] - release_time = utc2local(data['published_at']) - release_date = release_time.isoformat().split('T')[0] + latest_version = data["tag_name"] + release_time = utc2local(data["published_at"]) + release_date = release_time.isoformat().split("T")[0] if version.parse(local_version) < version.parse(latest_version): - update_status = 'new_version' + update_status = "new_version" else: - update_status = 'already_latest' + update_status = "already_latest" except Exception as e: - logger.debug('检查版本更新时出错: ' + repr(e)) - update_status = 'fail_to_check' + logger.debug("检查版本更新时出错: " + repr(e)) + update_status = "fail_to_check" else: - update_status = 'disallow' + update_status = "disallow" # 根据检查更新的情况输出软件版本信息和更新信息 - print(CLEAR_LINE, end='') - if update_status == 'disallow': - title = f'Jav Scraper Package: {local_version}' + print(CLEAR_LINE, end="") + if update_status == "disallow": + title = f"Jav Scraper Package: {local_version}" print_header([title]) - elif update_status == 'already_latest': - title = f'Jav Scraper Package: {local_version} (已是最新版)' + elif update_status == "already_latest": + title = f"Jav Scraper Package: {local_version} (已是最新版)" print_header([title]) - elif update_status == 'fail_to_check': - release_url_mirror = 'https://hub.fastgit.xyz/Yuukiy/JavSP/releases/latest' - titles = [f'Jav Scraper Package: {local_version}'] - info = ['检查更新失败,请前往以下地址查看最新版本:', ' '+release_url, - '如果你打不开上面的地址,也可以尝试访问镜像站点:', ' '+release_url_mirror] + elif update_status == "fail_to_check": + release_url_mirror = "https://hub.fastgit.xyz/Yuukiy/JavSP/releases/latest" + titles = [f"Jav Scraper Package: {local_version}"] + info = [ + "检查更新失败,请前往以下地址查看最新版本:", + " " + release_url, + "如果你打不开上面的地址,也可以尝试访问镜像站点:", + " " + release_url_mirror, + ] print_header(titles, info) - elif update_status == 'new_version': - titles = [f'Jav Scraper Package: {local_version}'] - titles.append(f'↓ 有新版本可下载: {latest_version} ↓') + elif update_status == "new_version": + titles = [f"Jav Scraper Package: {local_version}"] + titles.append(f"↓ 有新版本可下载: {latest_version} ↓") titles.append(release_url) # 提取changelog消息 try: enable_msg_head = True - lines = data['body'].splitlines() - changelog = [f'更新时间: {release_date}'] + lines = data["body"].splitlines() + changelog = [f"更新时间: {release_date}"] for line in lines: - if line.startswith('## '): + if line.startswith("## "): enable_msg_head = False changelog.append(Style.BRIGHT + line[3:] + Style.RESET_ALL) - elif line.startswith('- '): + elif line.startswith("- "): enable_msg_head = False changelog.append(line) elif enable_msg_head: @@ -232,15 +248,17 @@ def print_header(title, info=[]): # 尝试自动更新 if auto_update: try: - logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)") + logger.info( + "尝试自动更新到新版本: " + 
latest_version + " (按'Ctrl+C'取消)" + ) download_update(data) except KeyboardInterrupt: - logger.info('用户取消更新') + logger.info("用户取消更新") except Exception as e: - logger.warning('自动更新失败,请重启程序再试或者手动下载更新') + logger.warning("自动更新失败,请重启程序再试或者手动下载更新") logger.debug(e, exc_info=True) finally: - print() # 输出空行,作为新旧程序的分隔 + print() # 输出空行,作为新旧程序的分隔 def download_update(rel_info): @@ -249,22 +267,26 @@ def download_update(rel_info): Args: rel_info (json): 调用Github API得到的最新版的release信息 """ - if rel_info.get('assets') and getattr(sys, 'frozen', False): - down_url = rel_info['assets'][0]['browser_download_url'] - asset_name = rel_info['assets'][0]['name'] - desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name + if rel_info.get("assets") and getattr(sys, "frozen", False): + down_url = rel_info["assets"][0]["browser_download_url"] + asset_name = rel_info["assets"][0]["name"] + desc = ( + "下载更新" + if shutil.get_terminal_size().columns < 120 + else "下载更新: " + asset_name + ) download(down_url, asset_name, desc=desc) if os.path.exists(asset_name): # 备份原有的程序 basepath, ext = os.path.splitext(sys.executable) - backup_name = basepath + '_backup' + ext + backup_name = basepath + "_backup" + ext if os.path.exists(backup_name): os.remove(backup_name) os.rename(sys.executable, backup_name) # 解压下载的zip文件 - with zipfile.ZipFile(asset_name, 'r') as zip_ref: + with zipfile.ZipFile(asset_name, "r") as zip_ref: zip_ref.extractall() - logger.info('更新完成,启动新版本程序...') + logger.info("更新完成,启动新版本程序...") args = [sys.executable] + sys.argv[1:] p = subprocess.Popen(args, start_new_session=True) p.wait() @@ -273,5 +295,5 @@ def download_update(rel_info): if __name__ == "__main__": - setattr(sys, 'javsp_version', 'v0') + setattr(sys, "javsp_version", "v0") check_update() diff --git a/javsp/image.py b/javsp/image.py index e98ece903..5ea209879 100644 --- a/javsp/image.py +++ b/javsp/image.py @@ -1,11 +1,11 @@ """处理本地图片的相关功能""" + from enum import Enum -import os import logging from PIL import Image, ImageOps -__all__ = ['valid_pic', 'get_pic_size', 'add_label_to_poster', 'LabelPostion'] +__all__ = ["valid_pic", "get_pic_size", "add_label_to_poster", "LabelPostion"] logger = logging.getLogger(__name__) @@ -24,15 +24,19 @@ def valid_pic(pic_path): # 位置枚举 class LabelPostion(Enum): """水印位置枚举""" + TOP_LEFT = 1 TOP_RIGHT = 2 BOTTOM_LEFT = 3 BOTTOM_RIGHT = 4 -def add_label_to_poster(poster: Image.Image, mark_pic_file: Image.Image, pos: LabelPostion) -> Image.Image: + +def add_label_to_poster( + poster: Image.Image, mark_pic_file: Image.Image, pos: LabelPostion +) -> Image.Image: """向poster中添加标签(水印)""" - mark_img = mark_pic_file.convert('RGBA') - r,g,b,a = mark_img.split() + mark_img = mark_pic_file.convert("RGBA") + r, g, b, a = mark_img.split() # 计算水印位置 if pos == LabelPostion.TOP_LEFT: box = (0, 0) diff --git a/javsp/lib.py b/javsp/lib.py index 3b6932d76..8335918fd 100644 --- a/javsp/lib.py +++ b/javsp/lib.py @@ -1,14 +1,17 @@ """用来组织不需要依赖任何自定义类型的功能函数""" + import os import re import sys from pathlib import Path -__all__ = ['re_escape', 'resource_path', 'strftime_to_minutes', 'detect_special_attr'] +__all__ = ["re_escape", "resource_path", "strftime_to_minutes", "detect_special_attr"] + + +_special_chars_map = {i: "\\" + chr(i) for i in b"()[]{}?*+|^$\\."} -_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+|^$\\.'} def re_escape(s: str) -> str: """用来对字符串进行转义,以将转义后的字符串用于构造正则表达式""" pattern = s.translate(_special_chars_map) @@ -33,42 +36,46 @@ def strftime_to_minutes(s: str) -> int: Returns: [int]: 取整后的分钟数 """ - 
items = list(map(int, s.split(':'))) + items = list(map(int, s.split(":"))) if len(items) == 2: - minutes = items[0] + round(items[1]/60) + minutes = items[0] + round(items[1] / 60) elif len(items) == 3: - minutes = items[0] * 60 + items[1] + round(items[2]/60) + minutes = items[0] * 60 + items[1] + round(items[2] / 60) else: raise ValueError(f"无法将字符串'{s}'转换为分钟") return minutes -_PATTERN = re.compile(r'(uncen(sor(ed)?)?([- _\s]*leak(ed)?)?|[无無][码碼](流出|破解))', flags=re.I) +_PATTERN = re.compile( + r"(uncen(sor(ed)?)?([- _\s]*leak(ed)?)?|[无無][码碼](流出|破解))", flags=re.I +) + + def detect_special_attr(filepath: str, avid: str = None) -> str: """通过文件名检测影片是否有特殊属性(内嵌字幕、无码流出/破解) Returns: [str]: '', 'U', 'C', 'UC' """ - result = '' + result = "" base = os.path.splitext(os.path.basename(filepath))[0].upper() # 尝试使用正则匹配 match = _PATTERN.search(base) if match: - result += 'U' + result += "U" # 尝试匹配-C/-U/-UC后缀的影片 - postfix = base.split('-')[-1] - if postfix in ('U', 'C', 'UC'): + postfix = base.split("-")[-1] + if postfix in ("U", "C", "UC"): result += postfix elif avid: - pattern_str = re.sub(r'[_-]', '[_-]*', avid) + r'(UC|U|C)\b' + pattern_str = re.sub(r"[_-]", "[_-]*", avid) + r"(UC|U|C)\b" match = re.search(pattern_str, base, flags=re.I) if match: result += match.group(1) # 最终格式化 - result = ''.join(sorted(set(result), reverse=True)) + result = "".join(sorted(set(result), reverse=True)) return result if __name__ == "__main__": - print(detect_special_attr('ipx-177cd1.mp4', 'IPX-177')) + print(detect_special_attr("ipx-177cd1.mp4", "IPX-177")) diff --git a/javsp/nfo.py b/javsp/nfo.py index 573aa0cc3..01a05f544 100644 --- a/javsp/nfo.py +++ b/javsp/nfo.py @@ -1,4 +1,5 @@ """与操作nfo文件相关的功能""" + from lxml.etree import tostring from lxml.builder import E @@ -43,13 +44,13 @@ def write_nfo(info: MovieInfo, nfo_file): # 但是Emby不支持此特性,Jellyfin的文档和社区都比较弱,没找到相关说明,推测多半也不支持 # fanart通常也是通过给fanart图片命名来匹配 - nfo.append(E.mpaa('NC-17')) # 分级 + nfo.append(E.mpaa("NC-17")) # 分级 # 将DVD ID和CID写入到uniqueid字段 if info.dvdid: - nfo.append(E.uniqueid(info.dvdid, type='num', default='true')) + nfo.append(E.uniqueid(info.dvdid, type="num", default="true")) if info.cid: - nfo.append(E.uniqueid(info.cid, type='cid')) + nfo.append(E.uniqueid(info.cid, type="cid")) # 选择要写入的genre数据源字段:将[]作为后备结果,以确保genre结果为None时后续不会抛出异常 for genre_item in (info.genre_norm, info.genre, []): @@ -70,7 +71,7 @@ def write_nfo(info: MovieInfo, nfo_file): tags = [] # 添加自定义tag for tag_new in Cfg().summarizer.nfo.custom_tags_fields: - tags.append(tag_new.format(**dic)) + tags.append(tag_new.format(**dic)) # 去重 tags = list(set(tags)) # 写入tag @@ -78,7 +79,7 @@ def write_nfo(info: MovieInfo, nfo_file): nfo.append(E.tag(i)) # Kodi上的country字段没说必须使用国家的代码(比如JP),所以目前暂定直接使用国家名 - nfo.append(E.country('日本')) + nfo.append(E.country("日本")) if info.serial: # 部分影片有系列。set字段支持overview作为介绍,但是目前没发现有地方可以获取到系列的介绍 @@ -109,13 +110,20 @@ def write_nfo(info: MovieInfo, nfo_file): else: nfo.append(E.actor(E.name(i))) - with open(nfo_file, 'wt', encoding='utf-8') as f: - f.write(tostring(nfo, encoding='unicode', pretty_print=True, - doctype='')) + with open(nfo_file, "wt", encoding="utf-8") as f: + f.write( + tostring( + nfo, + encoding="unicode", + pretty_print=True, + doctype='', + ) + ) if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) - info = MovieInfo(from_file=R'unittest\data\IPX-177 (javbus).json') + info = MovieInfo(from_file=R"unittest\data\IPX-177 (javbus).json") write_nfo(info) diff --git a/javsp/print.py b/javsp/print.py index 
651b75679..d96e5b82c 100644 --- a/javsp/print.py +++ b/javsp/print.py @@ -1,9 +1,10 @@ """改写内置的print函数,将其输出重定向到tqdm""" + import tqdm import inspect -__all__ = ['TqdmOut'] +__all__ = ["TqdmOut"] # 普通输出和tqdm的输出混在一起会导致显示错乱,故在使用tqdm时要使用tqdm.write方法。 @@ -13,17 +14,22 @@ # 在单个模块内,不执行导入,这样的话在各个模块内仍然可以直接使用print builtin_print = print + + def flex_print(*args, **kwargs): try: tqdm.tqdm.write(*args, **kwargs) except: - builtin_print(*args, ** kwargs) + builtin_print(*args, **kwargs) + + # 替换内置的print inspect.builtins.print = flex_print class TqdmOut: """用于将logging的stream输出重定向到tqdm""" + @classmethod def write(cls, s, file=None, nolock=False): - tqdm.tqdm.write(s, file=file, end='', nolock=nolock) + tqdm.tqdm.write(s, file=file, end="", nolock=nolock) diff --git a/javsp/web/airav.py b/javsp/web/airav.py index 22e9fdbf7..9626f2653 100644 --- a/javsp/web/airav.py +++ b/javsp/web/airav.py @@ -1,4 +1,5 @@ """从airav抓取数据""" + import re import logging from html import unescape @@ -11,13 +12,13 @@ # 初始化Request实例 request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' +request.headers["Accept-Language"] = "zh-TW,zh;q=0.9" # 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒 request.timeout = 20 logger = logging.getLogger(__name__) -base_url = 'https://www.airav.wiki' +base_url = "https://www.airav.wiki" def search_movie(dvdid): @@ -27,74 +28,78 @@ def search_movie(dvdid): count = 1 result = [] while len(result) < count: - url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' + url = ( + f"{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}" + ) r = request.get(url).json() # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} - if r['result']: - result.extend(r['result']) - count = r['count'] + if r["result"]: + result.extend(r["result"]) + count = r["count"] page += 1 - else: # 结果为空,结束循环 + else: # 结果为空,结束循环 break # 如果什么都没搜索到,直接返回 if not result: raise MovieNotFoundError(__name__, dvdid) # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472') - result.sort(key=lambda x:x['barcode']) + result.sort(key=lambda x: x["barcode"]) # 从所有搜索结果中选择最可能的番号,返回它的URL - target = dvdid.replace('-', '_') + target = dvdid.replace("-", "_") for item in result: # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''} - barcode = item['barcode'].replace('-', '_') + barcode = item["barcode"].replace("-", "_") if target in barcode: - return item['barcode'] + return item["barcode"] raise MovieNotFoundError(__name__, dvdid, result) def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 - url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' + url = f"{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW" resp = request.get(url).json() # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 - if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): + if resp["count"] == 0 and re.match(r"\d{6}[-_]\d{2,3}", movie.dvdid): barcode = search_movie(movie.dvdid) if barcode: - url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW' + url = f"{base_url}/api/video/barcode/{barcode}?lng=zh-TW" resp = request.get(url).json() - if resp['count'] == 0: + if resp["count"] == 0: raise MovieNotFoundError(__name__, movie.dvdid, resp) # 从API返回的数据中提取需要的字段 # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展 - data = resp['result'] - dvdid = data['barcode'] + data = resp["result"] + dvdid = data["barcode"] movie.dvdid = dvdid - movie.url = base_url + '/video/' + dvdid + movie.url = 
base_url + "/video/" + dvdid # plot和title中可能含有HTML的转义字符,需要进行解转义处理 - movie.plot = unescape(data['description']) or None - movie.cover = data['img_url'] + movie.plot = unescape(data["description"]) or None + movie.cover = data["img_url"] # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id - movie.genre = [i['name'] for i in data['tags']] - movie.title = unescape(data['name']) - movie.actress = [i['name'] for i in data['actors']] - movie.publish_date = data['publish_date'] - movie.preview_pics = data['images'] or [] - if data['factories']: - movie.producer = data['factories'][0]['name'] + movie.genre = [i["name"] for i in data["tags"]] + movie.title = unescape(data["name"]) + movie.actress = [i["name"] for i in data["actors"]] + movie.publish_date = data["publish_date"] + movie.preview_pics = data["images"] or [] + if data["factories"]: + movie.producer = data["factories"][0]["name"] if Cfg().crawler.hardworking: # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') - video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" + video_url = ( + f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" + ) resp = request.get(video_url).json() # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} - if 'data' in resp: + if "data" in resp: # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 # TODO: 发现部分影片(如080719-976)的传统格式预览片错误 - movie.preview_video = resp['data'].get('url') + movie.preview_video = resp["data"].get("url") # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确 - for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'): + for keyword in ("馬賽克破壞版", "馬賽克破解版", "無碼流出版"): if movie.title and keyword in movie.title: movie.title = None movie.genre = [] @@ -107,10 +112,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('DSAD-938') + movie = MovieInfo("DSAD-938") try: parse_data(movie) print(movie) diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py index 433949018..156b5e045 100644 --- a/javsp/web/arzon.py +++ b/javsp/web/arzon.py @@ -1,6 +1,5 @@ """从arzon抓取数据""" -import os -import sys + import logging import re @@ -13,22 +12,26 @@ logger = logging.getLogger(__name__) base_url = "https://www.arzon.jp" + def get_cookie(): # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" + skip_verify_url = ( + "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" + ) session = requests.Session() session.get(skip_verify_url, timeout=(12, 7)) return session.cookies.get_dict() + def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" full_id = movie.dvdid cookies = get_cookie() - url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}' + url = f"{base_url}/itemlist.html?t=&m=all&s=&q={full_id}" # url = f'{base_url}/imagelist.html?q={full_id}' r = request_get(url, cookies, delay_raise=True) if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) + raise MovieNotFoundError(__name__, movie.dvdid) # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported data = html.fromstring(r.content) @@ -38,61 +41,65 @@ def parse_data(movie: MovieInfo): item_url = base_url + urls[0] e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) + item = 
html.fromstring(e.content) title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] cover = item.xpath("//td[@align='center']//a/img/@src")[0] item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] + plot = [item.strip() for item in item_text if item.strip() != ""][0] preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src") # 使用列表推导式添加 "http:" 并去除 "m_" preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr] container = item.xpath("//div[@class='item_register']/table//tr") for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "AV女優:": - movie.actress = content - if key == "AVメーカー:": - movie.producer = value - if key == "AVレーベル:": - video_type = value - if key == "シリーズ:": - movie.serial = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ""] + index = 0 + value = content[index] if content and index < len(content) else None + if key == "AV女優:": + movie.actress = content + if key == "AVメーカー:": + movie.producer = value + if key == "AVレーベル:": + video_type = value + if key == "シリーズ:": + movie.serial = value + if key == "監督:": + movie.director = value + if key == "発売日:" and value: + movie.publish_date = ( + re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + ) + if key == "収録時間:" and value: + movie.duration = re.search(r"([\d.]+)分", value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = "" if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) + genres = [video_type] + if genre != None: + genres.append(genre) movie.genre = genres movie.url = item_url movie.title = title movie.plot = plot - movie.cover = f'https:{cover}' + movie.cover = f"https:{cover}" movie.preview_pics = preview_pics + if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('csct-011') + movie = MovieInfo("csct-011") try: parse_data(movie) print(movie) diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py index 3ea7a322f..37748c091 100644 --- a/javsp/web/arzon_iv.py +++ b/javsp/web/arzon_iv.py @@ -1,6 +1,5 @@ """从arzon抓取数据""" -import os -import sys + import logging import re @@ -13,21 +12,25 @@ logger = logging.getLogger(__name__) base_url = "https://www.arzon.jp" + def get_cookie(): # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" + skip_verify_url = ( + "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" + ) session = requests.Session() session.get(skip_verify_url, timeout=(12, 7)) return session.cookies.get_dict() + def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" full_id = 
movie.dvdid cookies = get_cookie() - url = f'{base_url}/imagelist.html?q={full_id}' + url = f"{base_url}/imagelist.html?q={full_id}" r = request_get(url, cookies, delay_raise=True) if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) + raise MovieNotFoundError(__name__, movie.dvdid) # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported data = html.fromstring(r.content) @@ -37,55 +40,59 @@ def parse_data(movie: MovieInfo): item_url = base_url + urls[0] e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) + item = html.fromstring(e.content) title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] cover = item.xpath("//td[@align='center']//a/img/@src")[0] item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] + plot = [item.strip() for item in item_text if item.strip() != ""][0] container = item.xpath("//div[@class='item_register']/table//tr") for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "タレント:": - movie.actress = content - if key == "イメージメーカー:": - movie.producer = value - if key == "イメージレーベル:": - video_type = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ""] + index = 0 + value = content[index] if content and index < len(content) else None + if key == "タレント:": + movie.actress = content + if key == "イメージメーカー:": + movie.producer = value + if key == "イメージレーベル:": + video_type = value + if key == "監督:": + movie.director = value + if key == "発売日:" and value: + movie.publish_date = ( + re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + ) + if key == "収録時間:" and value: + movie.duration = re.search(r"([\d.]+)分", value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = "" if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) + genres = [video_type] + if genre != None: + genres.append(genre) movie.genre = genres movie.url = item_url movie.title = title movie.plot = plot - movie.cover = f'https:{cover}' + movie.cover = f"https:{cover}" + if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('KIDM-1137B') + movie = MovieInfo("KIDM-1137B") try: parse_data(movie) print(movie) diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py index ea96d6cc3..3d0bbae80 100644 --- a/javsp/web/avsox.py +++ b/javsp/web/avsox.py @@ -1,4 +1,5 @@ """从avsox抓取数据""" + import logging from javsp.web.base import get_html @@ -15,15 +16,15 @@ def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页 full_id = movie.dvdid - if full_id.startswith('FC2-'): - full_id = full_id.replace('FC2-', 'FC2-PPV-') - html = get_html(f'{base_url}tw/search/{full_id}') + 
if full_id.startswith("FC2-"): + full_id = full_id.replace("FC2-", "FC2-PPV-") + html = get_html(f"{base_url}tw/search/{full_id}") ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()") urls = html.xpath("//a[contains(@class, 'movie-box')]/@href") ids_lower = list(map(str.lower, ids)) if full_id.lower() in ids_lower: url = urls[ids_lower.index(full_id.lower())] - url = url.replace('/tw/', '/cn/', 1) + url = url.replace("/tw/", "/cn/", 1) else: raise MovieNotFoundError(__name__, movie.dvdid, ids) @@ -35,7 +36,7 @@ def parse_data(movie: MovieInfo): info = container.xpath("div/div[@class='col-md-3 info']")[0] dvdid = info.xpath("p/span[@style]/text()")[0] publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip() - duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip() + duration = info.xpath("p/span[text()='长度:']")[0].tail.replace("分钟", "").strip() producer, serial = None, None producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a") if producer_tag: @@ -46,15 +47,15 @@ def parse_data(movie: MovieInfo): genre = info.xpath("p/span[@class='genre']/a/text()") actress = container.xpath("//a[@class='avatar-box']/span/text()") - movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-') + movie.dvdid = dvdid.replace("FC2-PPV-", "FC2-") movie.url = url - movie.title = title.replace(dvdid, '').strip() + movie.title = title.replace(dvdid, "").strip() movie.cover = cover movie.publish_date = publish_date movie.duration = duration movie.genre = genre movie.actress = actress - if full_id.startswith('FC2-'): + if full_id.startswith("FC2-"): # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整 movie.producer = serial else: @@ -64,10 +65,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('082713-417') + movie = MovieInfo("082713-417") try: parse_data(movie) print(movie) diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py index fbd4ecbb3..216621a70 100644 --- a/javsp/web/avwiki.py +++ b/javsp/web/avwiki.py @@ -1,4 +1,5 @@ """从av-wiki抓取数据""" + import logging @@ -7,7 +8,7 @@ from javsp.datatype import MovieInfo logger = logging.getLogger(__name__) -base_url = 'https://av-wiki.net' +base_url = "https://av-wiki.net" def parse_data(movie: MovieInfo): @@ -15,7 +16,7 @@ def parse_data(movie: MovieInfo): Args: movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ - movie.url = url = f'{base_url}/{movie.dvdid}' + movie.url = url = f"{base_url}/{movie.dvdid}" resp = request_get(url, delay_raise=True) if resp.status_code == 404: raise MovieNotFoundError(__name__, movie.dvdid) @@ -24,47 +25,56 @@ def parse_data(movie: MovieInfo): cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img") if cover_tag: try: - srcset = cover_tag[0].get('srcset').split(', ') + srcset = cover_tag[0].get("srcset").split(", ") src_set_urls = {} for src in srcset: url, width = src.split() - width = int(width.rstrip('w')) + width = int(width.rstrip("w")) src_set_urls[width] = url - max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True) + max_pic = sorted(src_set_urls.items(), key=lambda x: x[0], reverse=True) movie.cover = max_pic[0][1] except: - movie.cover = cover_tag[0].get('src') + movie.cover = cover_tag[0].get("src") body = html.xpath("//section[@class='article-body']")[0] title = body.xpath("div/p/text()")[0] - title = title.replace(f"【{movie.dvdid}】", '') + title = title.replace(f"【{movie.dvdid}】", "") 
cite_url = body.xpath("div/cite/a/@href")[0] - cite_url = cite_url.split('?aff=')[0] + cite_url = cite_url.split("?aff=")[0] info = body.xpath("dl[@class='dltable']")[0] dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd") data = {} for dt_txt, dd in zip(dt_txt_ls, dd_tags): dt_txt = dt_txt.strip() - a_tag = dd.xpath('a') + a_tag = dd.xpath("a") if len(a_tag) == 0: dd_txt = dd.text.strip() else: dd_txt = [i.text.strip() for i in a_tag] - if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留 + if ( + isinstance(dd_txt, list) and dt_txt != "AV女優名" + ): # 只有女优名以列表的数据格式保留 dd_txt = dd_txt[0] data[dt_txt] = dd_txt - ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'} + ATTR_MAP = { + "メーカー": "producer", + "AV女優名": "actress", + "メーカー品番": "dvdid", + "シリーズ": "serial", + "配信開始日": "publish_date", + } for key, attr in ATTR_MAP.items(): setattr(movie, attr, data.get(key)) movie.title = title - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) - movie = MovieInfo('259LUXU-593') + movie = MovieInfo("259LUXU-593") try: parse_data(movie) print(movie) diff --git a/javsp/web/base.py b/javsp/web/base.py index 717b5168a..fee9da7c3 100644 --- a/javsp/web/base.py +++ b/javsp/web/base.py @@ -1,4 +1,5 @@ """网络请求的统一接口""" + import os import sys import time @@ -18,27 +19,42 @@ from javsp.web.exceptions import * -__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy'] +__all__ = [ + "Request", + "get_html", + "post_html", + "request_get", + "resp2html", + "is_connectable", + "download", + "get_resp_text", + "read_proxy", +] -headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'} +headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36" +} logger = logging.getLogger(__name__) # 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试 -cleaner = Cleaner(kill_tags=['script', 'noscript']) +cleaner = Cleaner(kill_tags=["script", "noscript"]) + def read_proxy(): if Cfg().network.proxy_server is None: return {} else: proxy = str(Cfg().network.proxy_server) - return {'http': proxy, 'https': proxy} + return {"http": proxy, "https": proxy} + # 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站 # 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个 # 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行进行定制 -class Request(): +class Request: """作为网络请求出口并支持各个模块定制功能""" + def __init__(self, use_scraper=False) -> None: # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 self.headers = headers.copy() @@ -59,44 +75,54 @@ def __init__(self, use_scraper=False) -> None: def _scraper_monitor(self, func): """监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求""" + def wrapper(*args, **kw): try: return func(*args, **kw) except Exception as e: - logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求") + logger.debug( + f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求" + ) if func == self.scraper.get: return requests.get(*args, **kw) else: return requests.post(*args, **kw) + return wrapper def get(self, url, delay_raise=False): - r = self.__get(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) + r = self.__get( + url, + 
headers=self.headers, + proxies=self.proxies, + cookies=self.cookies, + timeout=self.timeout, + ) if not delay_raise: r.raise_for_status() return r def post(self, url, data, delay_raise=False): - r = self.__post(url, - data=data, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) + r = self.__post( + url, + data=data, + headers=self.headers, + proxies=self.proxies, + cookies=self.cookies, + timeout=self.timeout, + ) if not delay_raise: r.raise_for_status() return r def head(self, url, delay_raise=True): - r = self.__head(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) + r = self.__head( + url, + headers=self.headers, + proxies=self.proxies, + cookies=self.cookies, + timeout=self.timeout, + ) if not delay_raise: r.raise_for_status() return r @@ -118,10 +144,12 @@ def request_get(url, cookies={}, timeout=None, delay_raise=False): """获取指定url的原始请求""" if timeout is None: timeout = Cfg().network.timeout.seconds - - r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout) + + r = requests.get( + url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout + ) if not delay_raise: - if r.status_code == 403 and b'>Just a moment...<' in r.content: + if r.status_code == 403 and b">Just a moment...<" in r.content: raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}") else: r.raise_for_status() @@ -132,7 +160,14 @@ def request_post(url, data, cookies={}, timeout=None, delay_raise=False): """向指定url发送post请求""" if timeout is None: timeout = Cfg().network.timeout.seconds - r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout) + r = requests.post( + url, + data=data, + headers=headers, + proxies=read_proxy(), + cookies=cookies, + timeout=timeout, + ) if not delay_raise: r.raise_for_status() return r @@ -147,7 +182,7 @@ def get_resp_text(resp: Response, encoding=None): return resp.text -def get_html(url, encoding='utf-8'): +def get_html(url, encoding="utf-8"): """使用get方法访问指定网页并返回经lxml解析后的document""" resp = request_get(url) text = get_resp_text(resp, encoding=encoding) @@ -155,23 +190,23 @@ def get_html(url, encoding='utf-8'): html.make_links_absolute(url, resolve_base_href=True) # 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus) # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): + if hasattr(sys, "javsp_debug_mode"): lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug return html -def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment: +def resp2html(resp, encoding="utf-8") -> lxml.html.HtmlComment: """将request返回的response转换为经lxml解析后的document""" text = get_resp_text(resp, encoding=encoding) html = lxml.html.fromstring(text) html.make_links_absolute(resp.url, resolve_base_href=True) # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): + if hasattr(sys, "javsp_debug_mode"): lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug return html -def post_html(url, data, encoding='utf-8', cookies={}): +def post_html(url, data, encoding="utf-8", cookies={}): """使用post方法访问指定网页并返回经lxml解析后的document""" resp = request_post(url, data, cookies=cookies) text = get_resp_text(resp, encoding=encoding) @@ -179,11 +214,11 @@ def post_html(url, data, encoding='utf-8', cookies={}): # jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理 ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]") for tag in ed2k_tags: - 
tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], '' + tag.attrib["ed2k"], tag.attrib["href"] = tag.attrib["href"], "" html.make_links_absolute(url, resolve_base_href=True) for tag in ed2k_tags: - tag.attrib['href'] = tag.attrib['ed2k'] - tag.attrib.pop('ed2k') + tag.attrib["href"] = tag.attrib["ed2k"] + tag.attrib.pop("ed2k") # html = cleaner.clean_html(html) # lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug return html @@ -192,9 +227,9 @@ def post_html(url, data, encoding='utf-8', cookies={}): def dump_xpath_node(node, filename=None): """将xpath节点dump到文件""" if not filename: - filename = node.tag + '.html' - with open(filename, 'wt', encoding='utf-8') as f: - content = etree.tostring(node, pretty_print=True).decode('utf-8') + filename = node.tag + ".html" + with open(filename, "wt", encoding="utf-8") as f: + content = etree.tostring(node, pretty_print=True).decode("utf-8") f.write(content) @@ -213,16 +248,17 @@ def urlretrieve(url, filename=None, reporthook=None, headers=None): headers["Referer"] = "https://www.arzon.jp/" """使用requests实现urlretrieve""" # https://blog.csdn.net/qq_38282706/article/details/80253447 - with contextlib.closing(requests.get(url, headers=headers, - proxies=read_proxy(), stream=True)) as r: + with contextlib.closing( + requests.get(url, headers=headers, proxies=read_proxy(), stream=True) + ) as r: header = r.headers - with open(filename, 'wb+') as fp: + with open(filename, "wb+") as fp: bs = 1024 size = -1 blocknum = 0 if "content-length" in header: - size = int(header["Content-Length"]) # 文件总大小(理论值) - if reporthook: # 写入前运行一次回调函数 + size = int(header["Content-Length"]) # 文件总大小(理论值) + if reporthook: # 写入前运行一次回调函数 reporthook(blocknum, bs, size) for chunk in r.iter_content(chunk_size=1024): if chunk: @@ -236,35 +272,40 @@ def urlretrieve(url, filename=None, reporthook=None, headers=None): def download(url, output_path, desc=None): """下载指定url的资源""" # 支持“下载”本地资源,以供fc2fan的本地镜像所使用 - if not url.startswith('http'): + if not url.startswith("http"): start_time = time.time() shutil.copyfile(url, output_path) filesize = os.path.getsize(url) elapsed = time.time() - start_time - info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed} + info = {"total": filesize, "elapsed": elapsed, "rate": filesize / elapsed} return info if not desc: - desc = url.split('/')[-1] + desc = url.split("/")[-1] referrer = headers.copy() - referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分 - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=desc, leave=False) as t: + referrer["referer"] = url[: url.find("/", 8) + 1] # 提取base_url部分 + with DownloadProgressBar( + unit="B", unit_scale=True, miniters=1, desc=desc, leave=False + ) as t: urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer) - info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')} + info = {k: t.format_dict[k] for k in ("total", "elapsed", "rate")} return info def open_in_chrome(url, new=0, autoraise=True): """使用指定的Chrome Profile打开url,便于调试""" import subprocess - chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe' + + chrome = R"C:\Program Files\Google\Chrome\Application\chrome.exe" subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True) + import webbrowser + webbrowser.open = open_in_chrome if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) - download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg') + 
download("https://www.javbus.com/pics/cover/6n54_b.jpg", "cover.jpg") diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py index 15267f1f7..da6884cc7 100644 --- a/javsp/web/dl_getchu.py +++ b/javsp/web/dl_getchu.py @@ -1,4 +1,5 @@ """从dl.getchu官网抓取数据""" + import re import logging @@ -9,19 +10,19 @@ logger = logging.getLogger(__name__) # https://dl.getchu.com/i/item4045373 -base_url = 'https://dl.getchu.com' +base_url = "https://dl.getchu.com" # dl.getchu用utf-8会乱码 -base_encode = 'euc-jp' +base_encode = "euc-jp" def get_movie_title(html): container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]") if len(container) > 0: container = container[0] - rows = container.xpath('.//tr') - title = '' + rows = container.xpath(".//tr") + title = "" for row in rows: - for cell in row.xpath('.//td/div'): + for cell in row.xpath(".//td/div"): # 获取单元格文本内容 if cell.text: title = str(cell.text).strip() @@ -29,11 +30,11 @@ def get_movie_title(html): def get_movie_img(html, getchu_id): - img_src = '' + img_src = "" container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]') if len(container) > 0: container = container[0] - img_src = container.get('src') + img_src = container.get("src") return img_src @@ -42,20 +43,22 @@ def get_movie_preview(html, getchu_id): container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]') if len(container) > 0: for c in container: - preview_pics.append(c.get('src')) + preview_pics.append(c.get("src")) return preview_pics -DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分') +DURATION_PATTERN = re.compile(r"(?:動画)?(\d+)分") + + def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # 去除番号中的'GETCHU'字样 id_uc = movie.dvdid.upper() - if not id_uc.startswith('GETCHU-'): - raise ValueError('Invalid GETCHU number: ' + movie.dvdid) - getchu_id = id_uc.replace('GETCHU-', '') + if not id_uc.startswith("GETCHU-"): + raise ValueError("Invalid GETCHU number: " + movie.dvdid) + getchu_id = id_uc.replace("GETCHU-", "") # 抓取网页 - url = f'{base_url}/i/item{getchu_id}' + url = f"{base_url}/i/item{getchu_id}" r = request_get(url, delay_raise=True) if r.status_code == 404: raise MovieNotFoundError(__name__, movie.dvdid) @@ -64,7 +67,7 @@ def parse_data(movie: MovieInfo): if len(container) > 0: container = container[0] # 将表格提取为键值对 - rows = container.xpath('.//table/tr') + rows = container.xpath(".//table/tr") kv_rows = [i for i in rows if len(i) == 2] data = {} for row in kv_rows: @@ -80,26 +83,26 @@ def parse_data(movie: MovieInfo): data[key] = value for key, value in data.items(): - if key == 'サークル': + if key == "サークル": movie.producer = value[0] - elif key == '作者': + elif key == "作者": # 暂时没有在getchu找到多个actress的片子 movie.actress = [i.strip() for i in value] - elif key == '画像数&ページ数': - match = DURATION_PATTERN.search(' '.join(value)) + elif key == "画像数&ページ数": + match = DURATION_PATTERN.search(" ".join(value)) if match: movie.duration = match.group(1) - elif key == '配信開始日': - movie.publish_date = value[0].replace('/', '-') - elif key == '趣向': + elif key == "配信開始日": + movie.publish_date = value[0].replace("/", "-") + elif key == "趣向": movie.genre = value - elif key == '作品内容': + elif key == "作品内容": idx = -1 for i, line in enumerate(value): - if line.lstrip().startswith('※'): + if line.lstrip().startswith("※"): idx = i break - movie.plot = ''.join(value[:idx]) + movie.plot = "".join(value[:idx]) movie.title = get_movie_title(html) movie.cover = get_movie_img(html, getchu_id) @@ -114,7 +117,7 @@ def parse_data(movie: MovieInfo): pretty_errors.configure(display_link=True) 
logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('getchu-4041026') + movie = MovieInfo("getchu-4041026") try: parse_data(movie) print(movie) diff --git a/javsp/web/exceptions.py b/javsp/web/exceptions.py index 0ea720d01..5db4af266 100644 --- a/javsp/web/exceptions.py +++ b/javsp/web/exceptions.py @@ -1,6 +1,15 @@ """网页抓取相关的异常""" -__all__ = ['CrawlerError', 'MovieNotFoundError', 'MovieDuplicateError', 'SiteBlocked', - 'SitePermissionError', 'CredentialError', 'WebsiteError', 'OtherError'] + +__all__ = [ + "CrawlerError", + "MovieNotFoundError", + "MovieDuplicateError", + "SiteBlocked", + "SitePermissionError", + "CredentialError", + "WebsiteError", + "OtherError", +] class CrawlerError(Exception): @@ -9,6 +18,7 @@ class CrawlerError(Exception): class MovieNotFoundError(CrawlerError): """表示某个站点没有抓取到某部影片""" + # 保持异常消息的简洁,同时又支持使用'logger.info(e, exc_info=True)'记录完整信息 def __init__(self, mod, avid, *args) -> None: msg = f"{mod}: 未找到影片: '{avid}'" @@ -20,6 +30,7 @@ def __str__(self): class MovieDuplicateError(CrawlerError): """影片重复""" + def __init__(self, mod, avid, dup_count, *args) -> None: msg = f"{mod}: '{avid}': 存在{dup_count}个完全匹配目标番号的搜索结果" super().__init__(msg, *args) diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py index e975c4c8f..305aaad8f 100644 --- a/javsp/web/fanza.py +++ b/javsp/web/fanza.py @@ -1,7 +1,6 @@ """从fanza抓取数据""" -import os + import re -import sys import json import logging from typing import Dict, List, Tuple @@ -14,25 +13,43 @@ logger = logging.getLogger(__name__) -base_url = 'https://www.dmm.co.jp' +base_url = "https://www.dmm.co.jp" # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) request = Request() -request.cookies = {'age_check_done': '1'} -request.headers['Accept-Language'] = 'ja,en-US;q=0.9' +request.cookies = {"age_check_done": "1"} +request.headers["Accept-Language"] = "ja,en-US;q=0.9" + + +_PRODUCT_PRIORITY = {"digital": 10, "mono": 5, "monthly": 2, "rental": 1} +_TYPE_PRIORITY = { + "videoa": 10, + "anime": 8, + "nikkatsu": 6, + "doujin": 4, + "dvd": 3, + "ppr": 2, + "paradisetv": 1, +} -_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} -_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} def sort_search_result(result: List[Dict]): """排序搜索结果""" - scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} - sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) + scores = { + i["url"]: ( + _PRODUCT_PRIORITY.get(i["product"], 0), + _TYPE_PRIORITY.get(i["type"], 0), + ) + for i in result + } + sorted_result = sorted(result, key=lambda x: scores[x["url"]], reverse=True) return sorted_result def get_urls_of_cid(cid: str) -> Tuple[str, str]: """搜索cid可能的影片URL""" - r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") + r = request.get( + f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0" + ) if r.status_code == 404: raise MovieNotFoundError(__name__, cid) r.raise_for_status() @@ -40,19 +57,21 @@ def get_urls_of_cid(cid: str) -> Tuple[str, str]: result = html.xpath("//ul[@id='list']/li/div/p/a/@href") parsed_result = {} for url in result: - items = url.split('/') + items = url.split("/") type_, cid = None, None for i, part in enumerate(items): - if part == '-': - product, type_ = items[i-2], items[i-1] - elif part.startswith('cid='): + if part == "-": + product, type_ = items[i - 2], 
items[i - 1] + elif part.startswith("cid="): cid = part[4:] - new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' - parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) + new_url = "/".join(i for i in items if not i.startswith("?")) + "/" + parsed_result.setdefault(cid, []).append( + {"product": product, "type": type_, "url": new_url} + ) break if cid not in parsed_result: if len(result) > 0: - logger.debug(f"Unknown URL in search result: " + ', '.join(result)) + logger.debug("Unknown URL in search result: " + ", ".join(result)) raise MovieNotFoundError(__name__, cid) sorted_result = sort_search_result(parsed_result[cid]) return sorted_result @@ -60,16 +79,18 @@ def get_urls_of_cid(cid: str) -> Tuple[str, str]: def resp2html_wrapper(resp): html = resp2html(resp) - if 'not available in your region' in html.text_content(): - raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') - elif '/login/' in resp.url: - raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP') + if "not available in your region" in html.text_content(): + raise SiteBlocked( + "FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置" + ) + elif "/login/" in resp.url: + raise SiteBlocked("FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP") return html def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" - default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/' + default_url = f"{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/" r0 = request.get(default_url, delay_raise=True) if r0.status_code == 404: urls = get_urls_of_cid(movie.cid) @@ -80,16 +101,18 @@ def parse_data(movie: MovieInfo): else: logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") continue - r = request.get(d['url']) + r = request.get(d["url"]) html = resp2html_wrapper(r) try: parse_func(movie, html) - movie.url = d['url'] + movie.url = d["url"] break except: logger.debug(f"Fail to parse {d['url']}", exc_info=True) if d is urls[-1]: - logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败") + logger.warning( + f"在fanza查找到的cid={movie.cid}的影片页面均解析失败" + ) raise else: html = resp2html_wrapper(r0) @@ -104,22 +127,32 @@ def parse_videoa_page(movie: MovieInfo, html): container = html.xpath("//table[@class='mg-b12']/tr/td")[0] cover = container.xpath("//div[@id='sample-video']/a/@href")[0] # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 - date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") + date_tag = container.xpath( + "//td[text()='配信開始日:']/following-sibling::td/text()" + ) if date_tag: - movie.publish_date = date_tag[0].strip().replace('/', '-') - duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() - match = re.search(r'\d+', duration_str) + movie.publish_date = date_tag[0].strip().replace("/", "-") + duration_str = container.xpath( + "//td[text()='収録時間:']/following-sibling::td/text()" + )[0].strip() + match = re.search(r"\d+", duration_str) if match: movie.duration = match.group(0) # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况 actress = container.xpath("//span[@id='performer']/a/text()") - director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()") + director_tag = container.xpath( + "//td[text()='監督:']/following-sibling::td/a/text()" + ) if director_tag: movie.director = director_tag[0].strip() - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + serial_tag = container.xpath( + "//td[text()='シリーズ:']/following-sibling::td/a/text()" + ) if serial_tag: movie.serial = 
serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + producer_tag = container.xpath( + "//td[text()='メーカー:']/following-sibling::td/a/text()" + ) if producer_tag: movie.producer = producer_tag[0].strip() # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 @@ -127,40 +160,48 @@ def parse_videoa_page(movie: MovieInfo, html): # if label_tag: # label = label_tag[0].strip() # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") + genre_tags = container.xpath( + "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]" + ) genre, genre_id = [], [] for tag in genre_tags: genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + genre_id.append(tag.get("href").split("=")[-1].strip("/")) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[ + 0 + ].strip() plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() preview_pics = container.xpath("//a[@name='sample-image']/img/@src") score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") if score_tag: - match = re.search(r'\d+', score_tag[0].strip()) + match = re.search(r"\d+", score_tag[0].strip()) if match: score = float(match.group()) * 2 - movie.score = f'{score:.2f}' + movie.score = f"{score:.2f}" else: - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 - + score_img = container.xpath( + "//td[text()='平均評価:']/following-sibling::td/img/@src" + )[0] + movie.score = int(score_img.split("/")[-1].split(".")[0]) # 00, 05 ... 
50 + if Cfg().crawler.hardworking: # 预览视频是动态加载的,不在静态网页中 - video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}' + video_url = f"{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}" html2 = request.get_html(video_url) # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 - script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() - match = re.search(r'\{.*\}', script) + script = html2.xpath( + "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()" + )[0].strip() + match = re.search(r"\{.*\}", script) # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配 try: data = json.loads(match.group()) - video_url = data.get('src') - if video_url and video_url.startswith('//'): - video_url = 'https:' + video_url + video_url = data.get("src") + if video_url and video_url.startswith("//"): + video_url = "https:" + video_url movie.preview_video = video_url except Exception as e: - logger.debug('解析视频地址时异常: ' + repr(e)) + logger.debug("解析视频地址时异常: " + repr(e)) movie.cid = cid movie.title = title @@ -170,7 +211,7 @@ def parse_videoa_page(movie: MovieInfo, html): movie.genre_id = genre_id movie.plot = plot movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 def parse_anime_page(movie: MovieInfo, html): @@ -178,27 +219,41 @@ def parse_anime_page(movie: MovieInfo, html): title = html.xpath("//h1[@id='title']/text()")[0] container = html.xpath("//table[@class='mg-b12']/tr/td")[0] cover = container.xpath("//img[@name='package-image']/@src")[0] - date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() - publish_date = date_str.replace('/', '-') - duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") + date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[ + 0 + ].strip() + publish_date = date_str.replace("/", "-") + duration_tag = container.xpath( + "//td[text()='収録時間:']/following-sibling::td/text()" + ) if duration_tag: - movie.duration = duration_tag[0].strip().replace('分', '') - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + movie.duration = duration_tag[0].strip().replace("分", "") + serial_tag = container.xpath( + "//td[text()='シリーズ:']/following-sibling::td/a/text()" + ) if serial_tag: movie.serial = serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + producer_tag = container.xpath( + "//td[text()='メーカー:']/following-sibling::td/a/text()" + ) if producer_tag: movie.producer = producer_tag[0].strip() - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]") + genre_tags = container.xpath( + "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]" + ) genre, genre_id = [], [] for tag in genre_tags: genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + genre_id.append(tag.get("href").split("=")[-1].strip("/")) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[ + 0 + ].strip() plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - score = 
int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 + score_img = container.xpath( + "//td[text()='平均評価:']/following-sibling::td/img/@src" + )[0] + score = int(score_img.split("/")[-1].split(".")[0]) # 00, 05 ... 50 movie.cid = cid movie.title = title @@ -207,9 +262,9 @@ def parse_anime_page(movie: MovieInfo, html): movie.genre = genre movie.genre_id = genre_id movie.plot = plot - movie.score = f'{score/5:.2f}' # 转换为10分制 + movie.score = f"{score/5:.2f}" # 转换为10分制 movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 # parse_dvd_page = parse_videoa_page # 118wtktabf067 @@ -220,10 +275,11 @@ def parse_anime_page(movie: MovieInfo, html): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo(cid='d_aisoft3356') + movie = MovieInfo(cid="d_aisoft3356") try: parse_data(movie) print(movie) diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py index 66be7ae4e..5e47df6e7 100644 --- a/javsp/web/fc2.py +++ b/javsp/web/fc2.py @@ -1,4 +1,5 @@ """从FC2官网抓取数据""" + import logging @@ -10,12 +11,12 @@ logger = logging.getLogger(__name__) -base_url = 'https://adult.contents.fc2.com' +base_url = "https://adult.contents.fc2.com" def get_movie_score(fc2_id): """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" - html = get_html(f'{base_url}/article/{fc2_id}/review') + html = get_html(f"{base_url}/article/{fc2_id}/review") review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li") reviews = {} for tag in review_tags: @@ -23,9 +24,9 @@ def get_movie_score(fc2_id): vote = int(tag.xpath("span")[0].text_content()) reviews[score] = vote total_votes = sum(reviews.values()) - if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧 - summary = sum([k*v for k, v in reviews.items()]) - final_score = summary / total_votes * 2 # 乘以2转换为10分制 + if total_votes >= 2: # 至少也该有两个人评价才有参考意义一点吧 + summary = sum([k * v for k, v in reviews.items()]) + final_score = summary / total_votes * 2 # 乘以2转换为10分制 return final_score @@ -33,14 +34,14 @@ def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # 去除番号中的'FC2'字样 id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') + if not id_uc.startswith("FC2-"): + raise ValueError("Invalid FC2 number: " + movie.dvdid) + fc2_id = id_uc.replace("FC2-", "") # 抓取网页 - url = f'{base_url}/article/{fc2_id}/' + url = f"{base_url}/article/{fc2_id}/" resp = request_get(url) - if '/id.fc2.com/' in resp.url: - raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') + if "/id.fc2.com/" in resp.url: + raise SiteBlocked("FC2要求当前IP登录账号才可访问,请尝试更换为日本IP") html = resp2html(resp) container = html.xpath("//div[@class='items_article_left']") if len(container) > 0: @@ -49,7 +50,7 @@ def parse_data(movie: MovieInfo): raise MovieNotFoundError(__name__, movie.dvdid) # FC2 标题增加反爬乱码,使用数组合并标题 title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()") - title = ''.join(title_arr) + title = "".join(title_arr) thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0] thumb_pic = thumb_tag.xpath("span/img/@src")[0] duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0] @@ -57,25 +58,31 @@ def parse_data(movie: MovieInfo): producer = container.xpath("//li[text()='by ']/a/text()")[0] genre = container.xpath("//a[@class='tag tagTag']/text()") date_str = 
container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0] - publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30' + publish_date = date_str[-10:].replace("/", "-") # '販売日 : 2017/11/30' preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href") if Cfg().crawler.hardworking: # 通过评论数据来计算准确的评分 score = get_movie_score(fc2_id) if score: - movie.score = f'{score:.2f}' + movie.score = f"{score:.2f}" # 预览视频是动态加载的,不在静态网页中 - desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0] - key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... - api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}' + desc_frame_url = container.xpath( + "//section[@class='items_article_Contents']/iframe/@src" + )[0] + key = desc_frame_url.split("=")[ + -1 + ] # /widget/article/718323/description?ac=60fc08fa... + api_url = f"{base_url}/api/v2/videos/{fc2_id}/sample?key={key}" r = request_get(api_url).json() - movie.preview_video = r['path'] + movie.preview_video = r["path"] else: # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 - score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0] + score_tag_attr = container.xpath( + "//a[@class='items_article_Stars']/p/span/@class" + )[0] score = int(score_tag_attr[-1]) * 2 - movie.score = f'{score:.2f}' + movie.score = f"{score:.2f}" movie.dvdid = id_uc movie.url = url @@ -94,10 +101,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('FC2-718323') + movie = MovieInfo("FC2-718323") try: parse_data(movie) print(movie) diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py index 229b3e3df..f3baa1d3b 100644 --- a/javsp/web/fc2fan.py +++ b/javsp/web/fc2fan.py @@ -1,4 +1,5 @@ """解析fc2fan本地镜像的数据""" + # FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据 import os import re @@ -21,7 +22,7 @@ def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" if use_local_mirror: - html_file = f'{base_path}/{movie.dvdid}.html' + html_file = f"{base_path}/{movie.dvdid}.html" if not os.path.exists(html_file): raise MovieNotFoundError(__name__, movie.dvdid, html_file) html = lxml.html.parse(html_file) @@ -30,23 +31,23 @@ def parse_data(movie: MovieInfo): r = requests.get(url) if r.status_code == 404: raise MovieNotFoundError(__name__, movie.dvdid) - elif r.text == '': - raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}') + elif r.text == "": + raise WebsiteError(f"fc2fan: 站点不可用 (HTTP {r.status_code}): {url}") html = resp2html(r) try: container = html.xpath("//div[@class='col-sm-8']")[0] except IndexError: - raise WebsiteError(f'fc2fan: 站点不可用') + raise WebsiteError("fc2fan: 站点不可用") title = container.xpath("h3/text()")[0] score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip() - match = re.search(r'\d+', score_str) + match = re.search(r"\d+", score_str) if match: - score = int(match.group()) / 10 # fc2fan站长是按100分来打分的 - movie.score = f'{score:.1f}' + score = int(match.group()) / 10 # fc2fan站长是按100分来打分的 + movie.score = f"{score:.1f}" resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail - if '无码' in resource_info: + if "无码" in resource_info: movie.uncensored = True - elif '有码' in resource_info: + elif "有码" in resource_info: movie.uncensored = False # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商 producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text 
@@ -56,7 +57,9 @@ def parse_data(movie: MovieInfo): actress = container.xpath("h5/strong[text()='女优名字']/../a/text()") preview_pics = container.xpath("//ul[@class='slides']/li/img/@src") if use_local_mirror: - preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics] + preview_pics = [ + os.path.normpath(os.path.join(base_path, i)) for i in preview_pics + ] # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到 movie.title = title @@ -69,10 +72,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('FC2-1879420') + movie = MovieInfo("FC2-1879420") try: parse_data(movie) print(movie) diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py index b0ad60892..ad06a5b9d 100644 --- a/javsp/web/fc2ppvdb.py +++ b/javsp/web/fc2ppvdb.py @@ -1,4 +1,5 @@ """从FC2PPVDB抓取数据""" + import logging from typing import List @@ -10,35 +11,47 @@ logger = logging.getLogger(__name__) -base_url = 'https://fc2ppvdb.com' +base_url = "https://fc2ppvdb.com" def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # 去除番号中的'FC2'字样 id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') + if not id_uc.startswith("FC2-"): + raise ValueError("Invalid FC2 number: " + movie.dvdid) + fc2_id = id_uc.replace("FC2-", "") # 抓取网页 - url = f'{base_url}/articles/{fc2_id}' + url = f"{base_url}/articles/{fc2_id}" html = get_html(url) - container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") + container = html.xpath( + "//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]" + ) if len(container) > 0: container = container[0] else: raise MovieNotFoundError(__name__, movie.dvdid) - + title = container.xpath("//h2/a/text()") thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src") - duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()") + duration_str = container.xpath( + "//div[starts-with(text(),'収録時間:')]/span/text()" + ) actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()") genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()") publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()") publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()") - uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()") - uncensored_str_f = get_list_first(uncensored_str); - uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None + uncensored_str = container.xpath( + "//div[starts-with(text(),'モザイク:')]/span/text()" + ) + uncensored_str_f = get_list_first(uncensored_str) + uncensored = ( + True + if uncensored_str_f == "無" + else False + if uncensored_str_f == "有" + else None + ) preview_pics = None preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href") @@ -60,15 +73,18 @@ def parse_data(movie: MovieInfo): else: movie.cover = get_list_first(thumb_pic) -def get_list_first(list:List): + +def get_list_first(list: List): return list[0] if list and len(list) > 0 else None + if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('FC2-4497837') + movie = MovieInfo("FC2-4497837") try: parse_data(movie) print(movie) diff --git a/javsp/web/gyutto.py 
b/javsp/web/gyutto.py index db7d6c795..d24d592b8 100644 --- a/javsp/web/gyutto.py +++ b/javsp/web/gyutto.py @@ -1,4 +1,5 @@ """从https://gyutto.com/官网抓取数据""" + import logging import time @@ -9,38 +10,41 @@ logger = logging.getLogger(__name__) # https://dl.gyutto.com/i/item266923 -base_url = 'http://gyutto.com' -base_encode = 'euc-jp' +base_url = "http://gyutto.com" +base_encode = "euc-jp" + def get_movie_title(html): container = html.xpath("//h1") if len(container) > 0: container = container[0] title = container.text - + return title -def get_movie_img(html, index = 1): + +def get_movie_img(html, index=1): images = [] container = html.xpath("//a[@class='highslide']/img") if len(container) > 0: if index == 0: - return container[0].get('src') - + return container[0].get("src") + for row in container: - images.append(row.get('src')) + images.append(row.get("src")) return images + def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # 去除番号中的'gyutto'字样 id_uc = movie.dvdid.upper() - if not id_uc.startswith('GYUTTO-'): - raise ValueError('Invalid gyutto number: ' + movie.dvdid) - gyutto_id = id_uc.replace('GYUTTO-', '') + if not id_uc.startswith("GYUTTO-"): + raise ValueError("Invalid gyutto number: " + movie.dvdid) + gyutto_id = id_uc.replace("GYUTTO-", "") # 抓取网页 - url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1' + url = f"{base_url}/i/item{gyutto_id}?select_uaflag=1" r = request_get(url, delay_raise=True) if r.status_code == 404: raise MovieNotFoundError(__name__, movie.dvdid) @@ -50,17 +54,17 @@ def parse_data(movie: MovieInfo): for row in container: key = row.xpath(".//dt/text()") if key[0] == "サークル": - producer = ''.join(row.xpath(".//dd/a/text()")) + producer = "".join(row.xpath(".//dd/a/text()")) elif key[0] == "ジャンル": genre = row.xpath(".//dd/a/text()") elif key[0] == "配信開始日": date = row.xpath(".//dd/text()") - date_str = ''.join(date) + date_str = "".join(date) date_time = time.strptime(date_str, "%Y年%m月%d日") publish_date = time.strftime("%Y-%m-%d", date_time) plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0] - + movie.title = get_movie_title(html) movie.cover = get_movie_img(html, 0) movie.preview_pics = get_movie_img(html) @@ -73,12 +77,13 @@ def parse_data(movie: MovieInfo): movie.genre = genre movie.plot = plot + if __name__ == "__main__": import pretty_errors pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('gyutto-266923') + movie = MovieInfo("gyutto-266923") try: parse_data(movie) diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py index 4e42617a5..2f3df7f0d 100644 --- a/javsp/web/jav321.py +++ b/javsp/web/jav321.py @@ -1,4 +1,5 @@ """从jav321抓取数据""" + import re import logging @@ -9,17 +10,17 @@ logger = logging.getLogger(__name__) -base_url = 'https://www.jav321.com' +base_url = "https://www.jav321.com" def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" - html = post_html(f'{base_url}/search', data={'sn': movie.dvdid}) + html = post_html(f"{base_url}/search", data={"sn": movie.dvdid}) page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] - #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 - cid = page_url.split('/')[-1] # /video/ipx00177 + # TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 + cid = page_url.split("/")[-1] # /video/ipx00177 # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片 - if cid == 'search': + if cid == "search": raise MovieNotFoundError(__name__, movie.dvdid) title = html.xpath("//div[@class='panel-heading']/h3/text()")[0] info = 
html.xpath("//div[@class='col-md-9']")[0] @@ -30,10 +31,12 @@ def parse_data(movie: MovieInfo): # actress, actress_pics # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白 actress, actress_pics = [], {} - actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img") + actress_tags = html.xpath( + "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img" + ) for tag in actress_tags: name = tag.tail.strip() - pic_url = tag.get('src') + pic_url = tag.get("src") actress.append(name) # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url, # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据 @@ -43,17 +46,19 @@ def parse_data(movie: MovieInfo): genre, genre_id = [], [] for tag in genre_tags: genre.append(tag.text) - genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1 - dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper() - publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '') + genre_id.append(tag.get("href").split("/")[-2]) # genre/4025/1 + dvdid = info.xpath("b[text()='品番']")[0].tail.replace(": ", "").upper() + publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(": ", "") duration_str = info.xpath("b[text()='収録時間']")[0].tail - match = re.search(r'\d+', duration_str) + match = re.search(r"\d+", duration_str) if match: movie.duration = match.group(0) # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星 - score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original") + score_tag = info.xpath( + "//b[text()='平均評価']/following-sibling::img/@data-original" + ) if score_tag: - score = int(score_tag[0][5:7])/5 # /10*2 + score = int(score_tag[0][5:7]) / 5 # /10*2 movie.score = str(score) serial_tag = info.xpath("a[contains(@href,'/series/')]/text()") if serial_tag: @@ -61,15 +66,21 @@ def parse_data(movie: MovieInfo): preview_video_tag = info.xpath("//video/source/@src") if preview_video_tag: movie.preview_video = preview_video_tag[0] - plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()") + plot_tag = info.xpath( + "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()" + ) if plot_tag: movie.plot = plot_tag[0] - preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src") + preview_pics = html.xpath( + "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src" + ) if len(preview_pics) == 0: # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL - preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src") + preview_pics = html.xpath( + "//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src" + ) # 有的图片链接里有多个//,网站质量堪忧…… - preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics] + preview_pics = [i[:8] + i[8:].replace("//", "/") for i in preview_pics] # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析 movie.url = page_url @@ -89,10 +100,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('SCUTE-1177') + movie = MovieInfo("SCUTE-1177") try: parse_data(movie) print(movie) diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py index a98cd9974..e40c29025 100644 --- a/javsp/web/javbus.py +++ b/javsp/web/javbus.py @@ -1,4 +1,5 @@ """从JavBus抓取数据""" + import logging @@ -10,8 +11,8 @@ logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javbus.csv') -permanent_url = 
'https://www.javbus.com' +genre_map = GenreMap("data/genre_javbus.csv") +permanent_url = "https://www.javbus.com" if Cfg().network.proxy_server is not None: base_url = permanent_url else: @@ -23,7 +24,7 @@ def parse_data(movie: MovieInfo): Args: movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ - url = f'{base_url}/{movie.dvdid}' + url = f"{base_url}/{movie.dvdid}" resp = request_get(url, delay_raise=True) # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 if resp.history and resp.history[0].status_code == 302: @@ -31,8 +32,8 @@ def parse_data(movie: MovieInfo): else: html = resp2html(resp) # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 - page_title = html.xpath('/html/head/title/text()') - if page_title and page_title[0].startswith('404 Page Not Found!'): + page_title = html.xpath("/html/head/title/text()") + if page_title and page_title[0].startswith("404 Page Not Found!"): raise MovieNotFoundError(__name__, movie.dvdid) container = html.xpath("//div[@class='container']")[0] @@ -42,9 +43,9 @@ def parse_data(movie: MovieInfo): info = container.xpath("//div[@class='col-md-3 info']")[0] dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip() - duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip() + duration = info.xpath("p/span[text()='長度:']")[0].tail.replace("分鐘", "").strip() director_tag = info.xpath("p/span[text()='導演:']") - if director_tag: # xpath没有匹配时将得到空列表 + if director_tag: # xpath没有匹配时将得到空列表 movie.director = director_tag[0].getnext().text.strip() producer_tag = info.xpath("p/span[text()='製作商:']") if producer_tag: @@ -61,12 +62,12 @@ def parse_data(movie: MovieInfo): genre_tags = info.xpath("//span[@class='genre']/label/a") genre, genre_id = [], [] for tag in genre_tags: - tag_url = tag.get('href') - pre_id = tag_url.split('/')[-1] + tag_url = tag.get("href") + pre_id = tag_url.split("/")[-1] genre.append(tag.text) - if 'uncensored' in tag_url: + if "uncensored" in tag_url: movie.uncensored = True - genre_id.append('uncensored-' + pre_id) + genre_id.append("uncensored-" + pre_id) else: movie.uncensored = False genre_id.append(pre_id) @@ -75,18 +76,18 @@ def parse_data(movie: MovieInfo): actress, actress_pics = [], {} actress_tags = html.xpath("//a[@class='avatar-box']/div/img") for tag in actress_tags: - name = tag.get('title') - pic_url = tag.get('src') + name = tag.get("title") + pic_url = tag.get("src") actress.append(name) - if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 + if not pic_url.endswith("nowprinting.gif"): # 略过默认的头像 actress_pics[name] = pic_url # 整理数据并更新movie的相应属性 - movie.url = f'{permanent_url}/{movie.dvdid}' + movie.url = f"{permanent_url}/{movie.dvdid}" movie.dvdid = dvdid - movie.title = title.replace(dvdid, '').strip() + movie.title = title.replace(dvdid, "").strip() movie.cover = cover movie.preview_pics = preview_pics - if publish_date != '0000-00-00': # 丢弃无效的发布日期 + if publish_date != "0000-00-00": # 丢弃无效的发布日期 movie.publish_date = publish_date movie.duration = duration if int(duration) else None movie.genre = genre @@ -99,15 +100,16 @@ def parse_clean_data(movie: MovieInfo): """解析指定番号的影片数据并进行清洗""" parse_data(movie) movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) + movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('NANP-030') + movie = 
MovieInfo("NANP-030") try: parse_clean_data(movie) print(movie) diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py index 5120aae76..149f05eeb 100644 --- a/javsp/web/javdb.py +++ b/javsp/web/javdb.py @@ -1,4 +1,5 @@ """从JavDB抓取数据""" + import os import re import logging @@ -14,11 +15,13 @@ # 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析 request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' +request.headers["Accept-Language"] = ( + "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5" +) logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javdb.csv') -permanent_url = 'https://javdb.com' +genre_map = GenreMap("data/genre_javdb.csv") +permanent_url = "https://javdb.com" if Cfg().network.proxy_server is not None: base_url = permanent_url else: @@ -31,29 +34,39 @@ def get_html_wrapper(url): r = request.get(url, delay_raise=True) if r.status_code == 200: # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 - if r.history and '/login' in r.url: + if r.history and "/login" in r.url: # 仅在需要时去读取Cookies - if 'cookies_pool' not in globals(): + if "cookies_pool" not in globals(): try: cookies_pool = get_browsers_cookies() except (PermissionError, OSError) as e: - logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True) + logger.warning( + f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", + exc_info=True, + ) cookies_pool = [] except Exception as e: - logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True) + logger.warning( + f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", + exc_info=True, + ) cookies_pool = [] if len(cookies_pool) > 0: item = cookies_pool.pop() # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies request = Request(use_scraper=True) - request.cookies = item['cookies'] - cookies_source = (item['profile'], item['site']) - logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}') + request.cookies = item["cookies"] + cookies_source = (item["profile"], item["site"]) + logger.debug( + f"未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}" + ) return get_html_wrapper(url) else: - raise CredentialError('JavDB: 所有浏览器Cookies均已过期') - elif r.history and 'pay' in r.url.split('/')[-1]: - raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") + raise CredentialError("JavDB: 所有浏览器Cookies均已过期") + elif r.history and "pay" in r.url.split("/")[-1]: + raise SitePermissionError( + f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'" + ) else: html = resp2html(r) return html @@ -62,42 +75,48 @@ def get_html_wrapper(url): code_tag = html.xpath("//span[@class='code-label']/span") error_code = code_tag[0].text if code_tag else None if error_code: - if error_code == '1020': - block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' + if error_code == "1020": + block_msg = f"JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器" else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})' + block_msg = ( + f"JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})" + ) else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url}' + block_msg = f"JavDB: {r.status_code} 禁止访问: {url}" raise SiteBlocked(block_msg) else: - raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}') + raise WebsiteError(f"JavDB: {r.status_code} 非预期状态码: {url}") def get_user_info(site, cookies): """获取cookies对应的JavDB用户信息""" try: request.cookies = cookies - html = 
request.get_html(f'https://{site}/users/profile') + html = request.get_html(f"https://{site}/users/profile") except Exception as e: - logger.info('JavDB: 获取用户信息时出错') + logger.info("JavDB: 获取用户信息时出错") logger.debug(e, exc_info=1) return # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点 - if 'JavDB' in html.text: - email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip() - username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip() + if "JavDB" in html.text: + email = html.xpath( + "//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()" + )[0].strip() + username = html.xpath( + "//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()" + )[0].strip() return email, username else: - logger.debug('JavDB: 域名已过期: ' + site) + logger.debug("JavDB: 域名已过期: " + site) def get_valid_cookies(): """扫描浏览器,获取一个可用的Cookies""" # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用 for d in cookies_pool: - info = get_user_info(d['site'], d['cookies']) + info = get_user_info(d["site"], d["cookies"]) if info: - return d['cookies'] + return d["cookies"] else: logger.debug(f"{d['profile']}, {d['site']}: Cookies无效") @@ -108,7 +127,7 @@ def parse_data(movie: MovieInfo): movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个 - html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}') + html = get_html_wrapper(f"{base_url}/search?q={movie.dvdid}") ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()"))) movie_urls = html.xpath("//a[@class='box']/@href") match_count = len([i for i in ids if i == movie.dvdid.lower()]) @@ -123,11 +142,11 @@ def parse_data(movie: MovieInfo): # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 box = html.xpath("//a[@class='box']")[index] movie.url = new_url - movie.title = box.get('title') + movie.title = box.get("title") movie.cover = box.xpath("div/img/@src")[0] score_str = box.xpath("div[@class='score']/span/span")[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) + score = re.search(r"([\d.]+)分", score_str).group(1) + movie.score = "{:.2f}".format(float(score) * 2) movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip() return else: @@ -136,25 +155,34 @@ def parse_data(movie: MovieInfo): container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0] info = container.xpath("//nav[@class='panel movie-panel-info']")[0] title = container.xpath("h2/strong[@class='current-title']/text()")[0] - show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]") + show_orig_title = container.xpath( + "//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]" + ) if show_orig_title: movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0] cover = container.xpath("//img[@class='video-cover']/@src")[0] - preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href") + preview_pics = container.xpath( + "//a[@class='tile-item'][@data-fancybox='gallery']/@href" + ) preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src") if preview_video_tag: preview_video = preview_video_tag[0] - if preview_video.startswith('//'): - preview_video = 'https:' + preview_video + if preview_video.startswith("//"): + preview_video = "https:" + preview_video movie.preview_video = preview_video dvdid = 
info.xpath("div/span")[0].text_content() publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text - duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip() + duration = ( + info.xpath("div/strong[text()='時長:']")[0] + .getnext() + .text.replace("分鍾", "") + .strip() + ) director_tag = info.xpath("div/strong[text()='導演:']") if director_tag: movie.director = director_tag[0].getnext().text_content().strip() av_type = guess_av_type(movie.dvdid) - if av_type != 'fc2': + if av_type != "fc2": producer_tag = info.xpath("div/strong[text()='片商:']") else: producer_tag = info.xpath("div/strong[text()='賣家:']") @@ -169,27 +197,29 @@ def parse_data(movie: MovieInfo): score_tag = info.xpath("//span[@class='score-stars']") if score_tag: score_str = score_tag[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) + score = re.search(r"([\d.]+)分", score_str).group(1) + movie.score = "{:.2f}".format(float(score) * 2) genre_tags = info.xpath("//strong[text()='類別:']/../span/a") genre, genre_id = [], [] for tag in genre_tags: - pre_id = tag.get('href').split('/')[-1] + pre_id = tag.get("href").split("/")[-1] genre.append(tag.text) genre_id.append(pre_id) # 判定影片有码/无码 - subsite = pre_id.split('?')[0] - movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite) + subsite = pre_id.split("?")[0] + movie.uncensored = {"uncensored": True, "tags": False}.get(subsite) # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优 actors_tag = info.xpath("//strong[text()='演員:']/../span")[0] all_actors = actors_tag.xpath("a/text()") genders = actors_tag.xpath("strong/text()") - actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀'] - magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href") + actress = [i for i in all_actors if genders[all_actors.index(i)] == "♀"] + magnet = container.xpath( + "//div[@class='magnet-name column is-four-fifths']/a/@href" + ) movie.dvdid = dvdid movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() + movie.title = title.replace(dvdid, "").strip() movie.cover = cover movie.preview_pics = preview_pics movie.publish_date = publish_date @@ -197,7 +227,7 @@ def parse_data(movie: MovieInfo): movie.genre = genre movie.genre_id = genre_id movie.actress = actress - movie.magnet = [i.replace('[javdb.com]','') for i in magnet] + movie.magnet = [i.replace("[javdb.com]", "") for i in magnet] def parse_clean_data(movie: MovieInfo): @@ -211,10 +241,12 @@ def parse_clean_data(movie: MovieInfo): movie.cover = None except SiteBlocked: raise - logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试') - if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')): + logger.error("JavDB: 可能触发了反爬虫机制,请稍后再试") + if movie.genre_id and (not movie.genre_id[0].startswith("fc2?")): movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) + movie.genre_id = ( + None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) + ) def collect_actress_alias(type=0, use_original=True): @@ -325,7 +357,7 @@ def collect_actress_alias(type=0, use_original=True): if __name__ == "__main__": # collect_actress_alias() - movie = MovieInfo('FC2-2735981') + movie = MovieInfo("FC2-2735981") try: parse_clean_data(movie) print(movie) diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py index 85f77b75f..f4b8c055a 100644 --- a/javsp/web/javlib.py +++ b/javsp/web/javlib.py @@ -1,4 +1,5 @@ """从JavLibrary抓取数据""" + import logging from urllib.parse import 
urlsplit @@ -7,21 +8,21 @@ from javsp.web.exceptions import * from javsp.web.proxyfree import get_proxy_free_url from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo +from javsp.datatype import MovieInfo # 初始化Request实例 request = Request(use_scraper=True) logger = logging.getLogger(__name__) -permanent_url = 'https://www.javlibrary.com' -base_url = '' +permanent_url = "https://www.javlibrary.com" +base_url = "" def init_network_cfg(): """设置合适的代理模式和base_url""" request.timeout = 5 - proxy_free_url = get_proxy_free_url('javlib') + proxy_free_url = get_proxy_free_url("javlib") urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url] if proxy_free_url and proxy_free_url not in urls: urls.insert(1, proxy_free_url) @@ -39,7 +40,7 @@ def init_network_cfg(): return url except Exception as e: logger.debug(f"Fail to connect to '{url}': {e}") - logger.warning('无法绕开JavLib的反爬机制') + logger.warning("无法绕开JavLib的反爬机制") request.timeout = Cfg().network.timeout.seconds return permanent_url @@ -51,7 +52,7 @@ def parse_data(movie: MovieInfo): if not base_url: base_url = init_network_cfg() logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}") - url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}' + url = new_url = f"{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}" resp = request.get(url) html = resp2html(resp) if resp.history: @@ -61,10 +62,10 @@ def parse_data(movie: MovieInfo): else: # 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段, # 为无效地址(应该是JavBus重定向配置有问题),需要使用新的base_url抓取数据 - base_url = 'https://' + urlsplit(resp.url).netloc + base_url = "https://" + urlsplit(resp.url).netloc logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}") return parse_data(movie) - else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果 + else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果 video_tags = html.xpath("//div[@class='video'][@id]/a") # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果 pre_choose = [] @@ -72,7 +73,7 @@ def parse_data(movie: MovieInfo): tag_dvdid = tag.xpath("div[@class='id']/text()")[0] if tag_dvdid.upper() == movie.dvdid.upper(): pre_choose.append(tag) - pre_choose_urls = [i.get('href') for i in pre_choose] + pre_choose_urls = [i.get("href") for i in pre_choose] match_count = len(pre_choose) if match_count == 0: raise MovieNotFoundError(__name__, movie.dvdid) @@ -81,18 +82,24 @@ def parse_data(movie: MovieInfo): elif match_count == 2: no_blueray = [] for tag in pre_choose: - if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc + if "ブルーレイディスク" not in tag.get("title"): # Blu-ray Disc no_blueray.append(tag) no_blueray_count = len(no_blueray) if no_blueray_count == 1: - new_url = no_blueray[0].get('href') - logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}") + new_url = no_blueray[0].get("href") + logger.debug( + f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}" + ) else: # 两个结果中没有谁是蓝光影片,说明影片番号重复了 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) + raise MovieDuplicateError( + __name__, movie.dvdid, match_count, pre_choose_urls + ) else: # 存在不同影片但是番号相同的情况,如MIDV-010 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) + raise MovieDuplicateError( + __name__, movie.dvdid, match_count, pre_choose_urls + ) # 重新抓取网页 html = request.get_html(new_url) container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0] @@ -112,15 +119,15 @@ def parse_data(movie: MovieInfo): movie.publisher = publisher_tag[0] score_tag = info.xpath("//span[@class='score']/text()") if score_tag: - 
movie.score = score_tag[0].strip('()') + movie.score = score_tag[0].strip("()") genre = info.xpath("//span[@class='genre']/a/text()") actress = info.xpath("//span[@class='star']/a/text()") movie.dvdid = dvdid movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() - if cover.startswith('//'): # 补全URL中缺少的协议段 - cover = 'https:' + cover + movie.title = title.replace(dvdid, "").strip() + if cover.startswith("//"): # 补全URL中缺少的协议段 + cover = "https:" + cover movie.cover = cover movie.publish_date = publish_date movie.duration = duration @@ -131,9 +138,10 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) base_url = permanent_url - movie = MovieInfo('IPX-177') + movie = MovieInfo("IPX-177") try: parse_data(movie) print(movie) diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py index 5296a69cd..310b5de0f 100644 --- a/javsp/web/javmenu.py +++ b/javsp/web/javmenu.py @@ -1,4 +1,5 @@ """从JavMenu抓取数据""" + import logging from javsp.web.base import Request, resp2html @@ -9,7 +10,7 @@ request = Request() logger = logging.getLogger(__name__) -base_url = 'https://mrzyx.xyz' +base_url = "https://mrzyx.xyz" def parse_data(movie: MovieInfo): @@ -18,7 +19,7 @@ def parse_data(movie: MovieInfo): movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ # JavMenu网页做得很不走心,将就了 - url = f'{base_url}/{movie.dvdid}' + url = f"{base_url}/{movie.dvdid}" r = request.get(url) if r.history: # 被重定向到主页说明找不到影片资源 @@ -28,13 +29,13 @@ def parse_data(movie: MovieInfo): container = html.xpath("//div[@class='col-md-9 px-0']")[0] title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 - title = title.replace(' | JAV目錄大全 | 每日更新', '') - title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') + title = title.replace(" | JAV目錄大全 | 每日更新", "") + title = title.replace(" 免費在線看", "").replace(" 免費AV在線看", "") cover_tag = container.xpath("//div[@class='single-video']") if len(cover_tag) > 0: - video_tag = cover_tag[0].find('video') + video_tag = cover_tag[0].find("video") # URL首尾竟然也有空格…… - movie.cover = video_tag.get('data-poster').strip() + movie.cover = video_tag.get("data-poster").strip() # 预览影片改为blob了,无法获取 # movie.preview_video = video_tag.find('source').get('src').strip() else: @@ -43,30 +44,39 @@ def parse_data(movie: MovieInfo): movie.cover = cover_img_tag[0].strip() info = container.xpath("//div[@class='card-body']")[0] publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text - duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '') - producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()") + duration = ( + info.xpath("div/span[contains(text(), '時長:')]")[0] + .getnext() + .text.replace("分鐘", "") + ) + producer = info.xpath( + "div/span[contains(text(), '製作:')]/following-sibling::a/span/text()" + ) if producer: movie.producer = producer[0] genre_tags = info.xpath("//a[@class='genre']") genre, genre_id = [], [] for tag in genre_tags: - items = tag.get('href').split('/') - pre_id = items[-3] + '/' + items[-1] + items = tag.get("href").split("/") + pre_id = items[-3] + "/" + items[-1] genre.append(tag.text.strip()) genre_id.append(pre_id) # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠…… - actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None + actress = ( + info.xpath("div/span[contains(text(), 
'女優:')]/following-sibling::*/a/text()") + or None + ) magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody") if magnet_table: magnet_links = magnet_table[0].xpath("tr/td/a/@href") # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以 - movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links] + movie.magnet = [i.replace("[javdb.com]", "") for i in magnet_links] preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href") if (not movie.cover) and preview_pics: movie.cover = preview_pics[0] movie.url = url - movie.title = title.replace(movie.dvdid, '').strip() + movie.title = title.replace(movie.dvdid, "").strip() movie.preview_pics = preview_pics movie.publish_date = publish_date movie.duration = duration @@ -77,10 +87,11 @@ def parse_data(movie: MovieInfo): if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('FC2-718323') + movie = MovieInfo("FC2-718323") try: parse_data(movie) print(movie) diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py index 4904e51db..a3d1ac7e8 100644 --- a/javsp/web/mgstage.py +++ b/javsp/web/mgstage.py @@ -1,4 +1,5 @@ """从蚊香社-mgstage抓取数据""" + import re import logging @@ -10,18 +11,18 @@ logger = logging.getLogger(__name__) -base_url = 'https://www.mgstage.com' +base_url = "https://www.mgstage.com" # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) request = Request() -request.cookies = {'adc': '1'} +request.cookies = {"adc": "1"} def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" - url = f'{base_url}/product/product_detail/{movie.dvdid}/' + url = f"{base_url}/product/product_detail/{movie.dvdid}/" resp = request.get(url, delay_raise=True) if resp.status_code == 403: - raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + raise SiteBlocked("mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理") # url不存在时会被重定向至主页。history非空时说明发生了重定向 elif resp.history: raise MovieNotFoundError(__name__, movie.dvdid) @@ -33,18 +34,28 @@ def parse_data(movie: MovieInfo): cover = container.xpath("//a[@id='EnlargeImage']/@href")[0] # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表 actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()") - actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()") + actress_link = container.xpath( + "//th[text()='出演:']/following-sibling::td/a/text()" + ) actress = [i.strip() for i in actress_text + actress_link] - actress = [i for i in actress if i] # 移除空字符串 - producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip() - duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0] - match = re.search(r'\d+', duration_str) + actress = [i for i in actress if i] # 移除空字符串 + producer = container.xpath( + "//th[text()='メーカー:']/following-sibling::td/a/text()" + )[0].strip() + duration_str = container.xpath( + "//th[text()='収録時間:']/following-sibling::td/text()" + )[0] + match = re.search(r"\d+", duration_str) if match: movie.duration = match.group(0) dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0] - date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0] - publish_date = date_str.replace('/', '-') - serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()") + date_str = container.xpath( + "//th[text()='配信開始日:']/following-sibling::td/text()" + )[0] + publish_date = date_str.replace("/", "-") + serial_tag = container.xpath( + 
"//th[text()='シリーズ:']/following-sibling::td/a/text()" + ) if serial_tag: movie.serial = serial_tag[0].strip() # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 @@ -52,10 +63,10 @@ def parse_data(movie: MovieInfo): genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a") genre = [i.text.strip() for i in genre_tags] score_str = container.xpath("//td[@class='review']/span")[0].tail.strip() - match = re.search(r'^[\.\d]+', score_str) + match = re.search(r"^[\.\d]+", score_str) if match: score = float(match.group()) * 2 - movie.score = f'{score:.2f}' + movie.score = f"{score:.2f}" # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签 plots = [] plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]") @@ -66,26 +77,26 @@ def parse_data(movie: MovieInfo): plots.append(p.text_content()) continue for child in children: - if child.tag == 'br' and plots[-1] != '\n': - plots.append('\n') + if child.tag == "br" and plots[-1] != "\n": + plots.append("\n") else: if child.text: plots.append(child.text) if child.tail: plots.append(child.tail) - plot = ''.join(plots).strip() + plot = "".join(plots).strip() preview_pics = container.xpath("//a[@class='sample_image']/@href") if Cfg().crawler.hardworking: # 预览视频是点击按钮后再加载的,不在静态网页中 btn_url = container.xpath("//a[@class='button_sample']/@href")[0] - video_pid = btn_url.split('/')[-1] - req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' + video_pid = btn_url.split("/")[-1] + req_url = f"{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}" resp = request.get(req_url).json() - video_url = resp.get('url') + video_url = resp.get("url") if video_url: # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX - preview_video = video_url.split('.ism/')[0] + '.mp4' + preview_video = video_url.split(".ism/")[0] + ".mp4" movie.preview_video = preview_video movie.dvdid = dvdid @@ -98,15 +109,16 @@ def parse_data(movie: MovieInfo): movie.genre = genre movie.plot = plot movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('HRV-045') + movie = MovieInfo("HRV-045") try: parse_data(movie) print(movie) diff --git a/javsp/web/njav.py b/javsp/web/njav.py index f94e943f3..331e44e6a 100644 --- a/javsp/web/njav.py +++ b/javsp/web/njav.py @@ -1,4 +1,5 @@ """从NJAV抓取数据""" + import re import logging from typing import List @@ -11,12 +12,13 @@ logger = logging.getLogger(__name__) -base_url = 'https://njav.tv/ja' +base_url = "https://njav.tv/ja" + def search_video(movie: MovieInfo): id_uc = movie.dvdid # 抓取网页 - url = f'{base_url}/search?keyword={id_uc}' + url = f"{base_url}/search?keyword={id_uc}" html = get_html(url) list = html.xpath("//div[@class='box-item']/div[@class='detail']/a") video_url = None @@ -26,13 +28,14 @@ def search_video(movie: MovieInfo): video_url = item.xpath("@href") break if id_uc.startswith("FC2-"): - fc2id = id_uc.replace('FC2-', '') + fc2id = id_uc.replace("FC2-", "") if "FC2" in search_title and fc2id in search_title: video_url = item.xpath("@href") break - + return get_list_first(video_url) - + + def parse_data(movie: MovieInfo): """解析指定番号的影片数据""" # 抓取网页 @@ -45,8 +48,10 @@ def parse_data(movie: MovieInfo): container = container[0] else: raise MovieNotFoundError(__name__, movie.dvdid) - - title = 
container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0] + + title = container.xpath( + "//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()" + )[0] thumb_pic = container.xpath("//div[@id='player']/@data-poster") plot = " ".join(container.xpath("//div[@class='description']/p/text()")) magnet = container.xpath("//div[@class='magnet']/a/@href") @@ -64,13 +69,13 @@ def parse_data(movie: MovieInfo): detail_dic = {} for item in container.xpath("//div[@class='detail-item']/div"): - item_title = item.xpath('span/text()')[0] + item_title = item.xpath("span/text()")[0] if "タグ:" in item_title: genre += item.xpath("span")[1].xpath("a/text()") elif "ジャンル:" in item_title: genre += item.xpath("span")[1].xpath("a/text()") elif "レーベル:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") + genre += item.xpath("span")[1].xpath("a/text()") elif "女優:" in item_title: actress = item.xpath("span")[1].xpath("a/text()") elif "シリーズ:" in item_title: @@ -83,18 +88,18 @@ def parse_data(movie: MovieInfo): publish_date = get_list_first(item.xpath("span")[1].xpath("text()")) elif "再生時間:" in item_title: duration_str = get_list_first(item.xpath("span")[1].xpath("text()")) - + # 清除标题里的番号字符 keywords = [real_id, " "] if movie.dvdid.startswith("FC2"): - keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]] + keywords += ["FC2", "PPV", "-"] + [movie.dvdid.split("-")[-1]] for keyword in keywords: - title = re.sub(re.escape(keyword), "", title, flags=re.I) + title = re.sub(re.escape(keyword), "", title, flags=re.I) # 判断是否无码 uncensored_arr = magnet + [title] for uncensored_str in uncensored_arr: - if 'uncensored' in uncensored_str.lower(): + if "uncensored" in uncensored_str.lower(): uncensored = True movie.url = url @@ -118,15 +123,18 @@ def parse_data(movie: MovieInfo): else: movie.cover = get_list_first(thumb_pic) -def get_list_first(list:List): + +def get_list_first(list: List): return list[0] if list and len(list) > 0 else None + if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('012023_002') + movie = MovieInfo("012023_002") try: parse_data(movie) print(movie) diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py index f6884c658..65e0eee0b 100644 --- a/javsp/web/prestige.py +++ b/javsp/web/prestige.py @@ -1,4 +1,5 @@ """从蚊香社-prestige抓取数据""" + import re import logging @@ -9,10 +10,10 @@ logger = logging.getLogger(__name__) -base_url = 'https://www.prestige-av.com' +base_url = "https://www.prestige-av.com" # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) -cookies = {'__age_auth__': 'true'} +cookies = {"__age_auth__": "true"} def parse_data(movie: MovieInfo): @@ -20,13 +21,15 @@ def parse_data(movie: MovieInfo): Args: movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 """ - url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}' + url = f"{base_url}/goods/goods_detail.php?sku={movie.dvdid}" resp = request_get(url, cookies=cookies, delay_raise=True) if resp.status_code == 500: # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 raise MovieNotFoundError(__name__, movie.dvdid) elif resp.status_code == 403: - raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + raise SiteBlocked( + "prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理" + ) resp.raise_for_status() html = resp2html(resp) container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") @@ -35,25 +38,41 @@ def parse_data(movie: 
MovieInfo): container = container_tags[0] title = container.xpath("h1/span")[0].tail.strip() - cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0] - cover = cover.split('?')[0] - actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()") + cover = container.xpath( + "//div[@class='c-ratio-image mr-8']/picture/source/img/@src" + )[0] + cover = cover.split("?")[0] + actress = container.xpath( + "//p[text()='出演者:']/following-sibling::div/p/a/text()" + ) # 移除女优名中的空格,使女优名与其他网站保持一致 - actress = [i.strip().replace(' ', '') for i in actress] - duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() - match = re.search(r'\d+', duration_str) + actress = [i.strip().replace(" ", "") for i in actress] + duration_str = ( + container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() + ) + match = re.search(r"\d+", duration_str) if match: movie.duration = match.group(0) - date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0] - publish_date = date_url.split('?date=')[-1] - producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip() + date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[ + 0 + ] + publish_date = date_url.split("?date=")[-1] + producer = container.xpath( + "//p[text()='メーカー:']/following-sibling::div/a/text()" + )[0].strip() dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0] genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a") genre = [tag.text.strip() for tag in genre_tags] - serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip() - plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip() - preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src") - preview_pics = [i.split('?')[0] for i in preview_pics] + serial = container.xpath( + "//p[text()='レーベル:']/following-sibling::div/a/text()" + )[0].strip() + plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[ + 0 + ].text.strip() + preview_pics = container.xpath( + "//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src" + ) + preview_pics = [i.split("?")[0] for i in preview_pics] # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效 movie.url = url @@ -67,15 +86,18 @@ def parse_data(movie: MovieInfo): movie.serial = serial movie.plot = plot movie.preview_pics = preview_pics - movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 + movie.uncensored = ( + False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 + ) if __name__ == "__main__": import pretty_errors + pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('ABP-647') + movie = MovieInfo("ABP-647") try: parse_data(movie) print(movie) diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py index 89c1e63a4..2c98b7cef 100644 --- a/javsp/web/proxyfree.py +++ b/javsp/web/proxyfree.py @@ -1,4 +1,5 @@ """获取各个网站的免代理地址""" + import re import sys @@ -17,15 +18,15 @@ def get_proxy_free_url(site_name: str, prefer_url=None) -> str: return prefer_url # 当prefer_url不可用时,尝试自动获取指定网站的免代理地址 site_name = site_name.lower() - func_name = f'_get_{site_name}_urls' - get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')] + func_name = f"_get_{site_name}_urls" + get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith("_get_")] if func_name in get_funcs: get_urls = 
getattr(sys.modules[__name__], func_name) try: urls = get_urls() return _choose_one(urls) except: - return '' + return "" else: raise Exception("Dont't know how to get proxy-free url for " + site_name) @@ -34,42 +35,52 @@ def _choose_one(urls) -> str: for url in urls: if is_connectable(url, timeout=5): return url - return '' + return "" def _get_avsox_urls() -> list: - html = get_html('https://tellme.pw/avsox') - urls = html.xpath('//h4/strong/a/@href') + html = get_html("https://tellme.pw/avsox") + urls = html.xpath("//h4/strong/a/@href") return urls def _get_javbus_urls() -> list: - html = get_html('https://www.javbus.one/') + html = get_html("https://www.javbus.one/") text = html.text_content() - urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) + urls = re.findall( + r"防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})", + text, + re.I | re.A, + ) return urls def _get_javlib_urls() -> list: - html = get_html('https://github.com/javlibcom') - text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() - match = re.search(r'[\w\.]+', text, re.A) + html = get_html("https://github.com/javlibcom") + text = html.xpath( + "//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']" + )[0].text_content() + match = re.search(r"[\w\.]+", text, re.A) if match: - domain = f'https://www.{match.group(0)}.com' + domain = f"https://www.{match.group(0)}.com" return [domain] def _get_javdb_urls() -> list: - html = get_html('https://jav524.app') + html = get_html("https://jav524.app") js_links = html.xpath("//script[@src]/@src") for link in js_links: - if '/js/index' in link: + if "/js/index" in link: text = get_resp_text(request_get(link)) - match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) + match = re.search( + r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', + text, + flags=re.I | re.A, + ) if match: return [match.group(1)] if __name__ == "__main__": - print('javdb:\t', _get_javdb_urls()) - print('javlib:\t', _get_javlib_urls()) + print("javdb:\t", _get_javdb_urls()) + print("javlib:\t", _get_javlib_urls()) diff --git a/javsp/web/translate.py b/javsp/web/translate.py index 2e762cb15..1ef736287 100644 --- a/javsp/web/translate.py +++ b/javsp/web/translate.py @@ -1,23 +1,30 @@ """网页翻译接口""" + # 由于翻译服务不走代理,而且需要自己的错误处理机制,因此不通过base.py来管理网络请求 +import logging +import random import time -from typing import Union import uuid -import random -import logging -from pydantic_core import Url -import requests from hashlib import md5 +from typing import Union +import requests +from pydantic_core import Url -__all__ = ['translate', 'translate_movie_info'] +__all__ = ["translate", "translate_movie_info"] -from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine +from javsp.config import ( + BaiduTranslateEngine, + BingTranslateEngine, + Cfg, + ClaudeTranslateEngine, + GoogleTranslateEngine, + OpenAITranslateEngine, +) from javsp.datatype import MovieInfo from javsp.web.base import read_proxy - logger = logging.getLogger(__name__) @@ -26,36 +33,42 @@ def translate_movie_info(info: MovieInfo): # 翻译标题 if info.title and Cfg().translator.fields.title and info.ori_title is None: result = translate(info.title, Cfg().translator.engine, info.actress) - if 'trans' in result: + if "trans" in result: 
info.ori_title = info.title - info.title = result['trans'] + info.title = result["trans"] # 如果有的话,附加断句信息 - if 'orig_break' in result: - setattr(info, 'ori_title_break', result['orig_break']) - if 'trans_break' in result: - setattr(info, 'title_break', result['trans_break']) + if "orig_break" in result: + setattr(info, "ori_title_break", result["orig_break"]) + if "trans_break" in result: + setattr(info, "title_break", result["trans_break"]) else: - logger.error('翻译标题时出错: ' + result['error']) + logger.error("翻译标题时出错: " + result["error"]) return False # 翻译简介 if info.plot and Cfg().translator.fields.plot: result = translate(info.plot, Cfg().translator.engine, info.actress) - if 'trans' in result: + if "trans" in result: # 只有翻译过plot的影片才可能需要ori_plot属性,因此在运行时动态添加,而不添加到类型定义里 - setattr(info, 'ori_plot', info.plot) - info.plot = result['trans'] + setattr(info, "ori_plot", info.plot) + info.plot = result["trans"] else: - logger.error('翻译简介时出错: ' + result['error']) + logger.error("翻译简介时出错: " + result["error"]) return False return True -def translate(texts, engine: Union[ + +def translate( + texts, + engine: Union[ BaiduTranslateEngine, BingTranslateEngine, ClaudeTranslateEngine, OpenAITranslateEngine, - None - ], actress=[]): + GoogleTranslateEngine, + None, + ], + actress=[], +): """ 翻译入口:对错误进行处理并且统一返回格式 @@ -65,84 +78,108 @@ def translate(texts, engine: Union[ 翻译出错: {'error': 'baidu: 54000: PARAM_FROM_TO_OR_Q_EMPTY'} """ rtn = {} - err_msg = '' - if engine.name == 'baidu': + err_msg = "" + if engine.name == "baidu": result = baidu_translate(texts, engine.app_id, engine.api_key) - if 'error_code' not in result: + if "error_code" not in result: # 百度翻译的结果中的组表示的是按换行符分隔的不同段落,而不是句子 - paragraphs = [i['dst'] for i in result['trans_result']] - rtn = {'trans': '\n'.join(paragraphs)} + paragraphs = [i["dst"] for i in result["trans_result"]] + rtn = {"trans": "\n".join(paragraphs)} else: - err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg']) - elif engine.name == 'bing': + err_msg = "{}: {}: {}".format( + engine, result["error_code"], result["error_msg"] + ) + elif engine.name == "bing": # 使用动态词典保护原文中的女优名,防止翻译后认不出来 for i in actress: - texts = texts.replace(i, f'{i}') + texts = texts.replace( + i, f'{i}' + ) result = bing_translate(texts, api_key=engine.api_key) - if 'error' not in result: - sentLen = result[0]['translations'][0]['sentLen'] + if "error" not in result: + sentLen = result[0]["translations"][0]["sentLen"] orig_break, trans_break = [], [] # 对原文进行断句 remaining = texts - for i in sentLen['srcSentLen']: + for i in sentLen["srcSentLen"]: orig_break.append(remaining[:i]) remaining = remaining[i:] # 对译文进行断句 - remaining = result[0]['translations'][0]['text'] - for i in sentLen['transSentLen']: + remaining = result[0]["translations"][0]["text"] + for i in sentLen["transSentLen"]: # Bing会在译文的每个句尾添加一个空格,这并不符合中文的标点习惯,所以去掉这个空格 - trans_break.append(remaining[:i].rstrip(' ')) + trans_break.append(remaining[:i].rstrip(" ")) remaining = remaining[i:] - trans = ''.join(trans_break) - rtn = {'trans': trans, 'orig_break': orig_break, 'trans_break': trans_break} + trans = "".join(trans_break) + rtn = {"trans": trans, "orig_break": orig_break, "trans_break": trans_break} else: - err_msg = "{}: {}: {}".format(engine, result['error']['code'], result['error']['message']) - elif engine.name == 'claude': + err_msg = "{}: {}: {}".format( + engine, result["error"]["code"], result["error"]["message"] + ) + elif engine.name == "claude": try: result = claude_translate(texts, engine.api_key) - if 
'error_code' not in result: - rtn = {'trans': result} + if "error_code" not in result: + rtn = {"trans": result} else: - err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg']) + err_msg = "{}: {}: {}".format( + engine, result["error_code"], result["error_msg"] + ) except Exception as e: err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e)) - elif engine.name == 'openai': + elif engine.name == "openai": try: result = openai_translate(texts, engine.url, engine.api_key, engine.model) - if 'error_code' not in result: - rtn = {'trans': result} + if "error_code" not in result: + rtn = {"trans": result} else: - err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg']) + err_msg = "{}: {}: {}".format( + engine, result["error_code"], result["error_msg"] + ) except Exception as e: err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e)) - elif engine.name == 'google': + elif engine.name == "google": try: result = google_trans(texts) # 经测试,翻译成功时会带有'sentences'字段;失败时不带,也没有故障码 - if 'sentences' in result: + if "sentences" in result: # Google会对句子分组,完整的译文需要自行拼接 - orig_break = [i['orig'] for i in result['sentences']] - trans_break = [i['trans'] for i in result['sentences']] - trans = ''.join(trans_break) - rtn = {'trans': trans, 'orig_break': orig_break, 'trans_break': trans_break} + orig_break = [i["orig"] for i in result["sentences"]] + trans_break = [i["trans"] for i in result["sentences"]] + trans = "".join(trans_break) + rtn = { + "trans": trans, + "orig_break": orig_break, + "trans_break": trans_break, + } else: - err_msg = "{}: {}: {}".format(engine, result['error_code'], result['error_msg']) + err_msg = "{}: {}: {}".format( + engine, result["error_code"], result["error_msg"] + ) except Exception as e: err_msg = "{}: {}: Exception: {}".format(engine, -2, repr(e)) else: - return {'trans': texts} + return {"trans": texts} -def baidu_translate(texts, app_id, api_key, to='zh'): + +def baidu_translate(texts, app_id, api_key, to="zh"): """使用百度翻译文本(默认翻译为简体中文)""" api_url = "https://api.fanyi.baidu.com/api/trans/vip/translate" - headers = {'Content-Type': 'application/x-www-form-urlencoded'} + headers = {"Content-Type": "application/x-www-form-urlencoded"} salt = random.randint(0, 0x7FFFFFFF) sign_input = app_id + texts + str(salt) + api_key - sign = md5(sign_input.encode('utf-8')).hexdigest() - payload = {'appid': app_id, 'q': texts, 'from': 'auto', 'to': to, 'salt': salt, 'sign': sign} + sign = md5(sign_input.encode("utf-8")).hexdigest() + payload = { + "appid": app_id, + "q": texts, + "from": "auto", + "to": to, + "salt": salt, + "sign": sign, + } # 由于百度标准版限制QPS为1,连续翻译标题和简介会超限,因此需要添加延时 now = time.perf_counter() - last_access = getattr(baidu_translate, '_last_access', -1) + last_access = getattr(baidu_translate, "_last_access", -1) wait = 1.0 - (now - last_access) if wait > 0: time.sleep(wait) @@ -152,24 +189,26 @@ def baidu_translate(texts, app_id, api_key, to='zh'): return result -def bing_translate(texts, api_key, to='zh-Hans'): +def bing_translate(texts, api_key, to="zh-Hans"): """使用Bing翻译文本(默认翻译为简体中文)""" api_url = "https://api.cognitive.microsofttranslator.com/translate" - params = {'api-version': '3.0', 'to': to, 'includeSentenceLength': True} + params = {"api-version": "3.0", "to": to, "includeSentenceLength": True} headers = { - 'Ocp-Apim-Subscription-Key': api_key, - 'Ocp-Apim-Subscription-Region': 'global', - 'Content-type': 'application/json', - 'X-ClientTraceId': str(uuid.uuid4()) + "Ocp-Apim-Subscription-Key": api_key, + 
"Ocp-Apim-Subscription-Region": "global", + "Content-type": "application/json", + "X-ClientTraceId": str(uuid.uuid4()), } - body = [{'text': texts}] + body = [{"text": texts}] r = requests.post(api_url, params=params, headers=headers, json=body) result = r.json() return result _google_trans_wait = 60 -def google_trans(texts, to='zh_CN'): + + +def google_trans(texts, to="zh_CN"): """使用Google翻译文本(默认翻译为简体中文)""" # API: https://www.jianshu.com/p/ce35d89c25c3 # client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017 @@ -178,7 +217,9 @@ def google_trans(texts, to='zh_CN'): proxies = read_proxy() r = requests.get(url, proxies=proxies) while r.status_code == 429: - logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") + logger.warning( + f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试" + ) time.sleep(_google_trans_wait) r = requests.get(url, proxies=proxies) if r.status_code == 429: @@ -186,10 +227,11 @@ def google_trans(texts, to='zh_CN'): if r.status_code == 200: result = r.json() else: - result = {'error_code': r.status_code, 'error_msg': r.reason} - time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间 + result = {"error_code": r.status_code, "error_msg": r.reason} + time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间 return result + def claude_translate(texts, api_key, to="zh_CN"): """使用Claude翻译文本(默认翻译为简体中文)""" api_url = "https://api.anthropic.com/v1/messages" @@ -214,6 +256,7 @@ def claude_translate(texts, api_key, to="zh_CN"): } return result + def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): """使用 OpenAI 翻译文本(默认翻译为简体中文)""" api_url = str(url) @@ -222,29 +265,32 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): "Authorization": f"Bearer {api_key}", } data = { - "messages": [ - { - "role": "system", - "content": f"Translate the following Japanese paragraph into {to}, while leaving non-Japanese text, names, or text that does not look like Japanese untranslated. Reply with the translated text only, do not add any text that is not in the original content." - }, - { - "role": "user", - "content": texts - } - ], - "model": model, - "temperature": 0, - "max_tokens": 1024, + "messages": [ + { + "role": "system", + "content": f"Translate the following Japanese paragraph into {to}, while leaving non-Japanese text, names, or text that does not look like Japanese untranslated. 
Reply with the translated text only, do not add any text that is not in the original content.", + }, + {"role": "user", "content": texts}, + ], + "model": model, + "temperature": 0, + "max_tokens": 1024, } r = requests.post(api_url, headers=headers, json=data) if r.status_code == 200: - if 'error' in r.json(): + if "error" in r.json(): result = { "error_code": r.status_code, "error_msg": r.json().get("error", {}).get("message", ""), } else: - result = r.json().get("choices", [{}])[0].get("message", {}).get("content", "").strip() + result = ( + r.json() + .get("choices", [{}])[0] + .get("message", {}) + .get("content", "") + .strip() + ) else: result = { "error_code": r.status_code, diff --git a/poetry.lock b/poetry.lock index 1c92293a3..14bb35742 100644 --- a/poetry.lock +++ b/poetry.lock @@ -549,27 +549,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "flake8" -version = "7.1.1" -description = "the modular source code checker: pep8 pyflakes and co" -optional = false -python-versions = ">=3.8.1" -files = [ - {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"}, - {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"}, -] - -[package.dependencies] -mccabe = ">=0.7.0,<0.8.0" -pycodestyle = ">=2.12.0,<2.13.0" -pyflakes = ">=3.2.0,<3.3.0" - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "idna" version = "3.10" @@ -889,22 +868,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -optional = false -python-versions = ">=3.6" -files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "packaging" version = "24.1" @@ -1176,22 +1139,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "pycodestyle" -version = "2.12.1" -description = "Python style guide checker" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, - {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "pycparser" version = "2.22" @@ -1412,22 +1359,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "pyflakes" -version = "3.2.0" -description = "passive checker of Python programs" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, - {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, -] - -[package.source] -type = "legacy" -url = 
"https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "pyparsing" version = "3.1.4" @@ -1668,6 +1599,38 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "ruff" +version = "0.6.8" +description = "An extremely fast Python linter and code formatter, written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.6.8-py3-none-linux_armv6l.whl", hash = "sha256:77944bca110ff0a43b768f05a529fecd0706aac7bcce36d7f1eeb4cbfca5f0f2"}, + {file = "ruff-0.6.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:27b87e1801e786cd6ede4ada3faa5e254ce774de835e6723fd94551464c56b8c"}, + {file = "ruff-0.6.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:cd48f945da2a6334f1793d7f701725a76ba93bf3d73c36f6b21fb04d5338dcf5"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:677e03c00f37c66cea033274295a983c7c546edea5043d0c798833adf4cf4c6f"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9f1476236b3eacfacfc0f66aa9e6cd39f2a624cb73ea99189556015f27c0bdeb"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5a2f17c7d32991169195d52a04c95b256378bbf0de8cb98478351eb70d526f"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5fd0d4b7b1457c49e435ee1e437900ced9b35cb8dc5178921dfb7d98d65a08d0"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8034b19b993e9601f2ddf2c517451e17a6ab5cdb1c13fdff50c1442a7171d87"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6cfb227b932ba8ef6e56c9f875d987973cd5e35bc5d05f5abf045af78ad8e098"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef0411eccfc3909269fed47c61ffebdcb84a04504bafa6b6df9b85c27e813b0"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:007dee844738c3d2e6c24ab5bc7d43c99ba3e1943bd2d95d598582e9c1b27750"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ce60058d3cdd8490e5e5471ef086b3f1e90ab872b548814e35930e21d848c9ce"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1085c455d1b3fdb8021ad534379c60353b81ba079712bce7a900e834859182fa"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:70edf6a93b19481affd287d696d9e311388d808671bc209fb8907b46a8c3af44"}, + {file = "ruff-0.6.8-py3-none-win32.whl", hash = "sha256:792213f7be25316f9b46b854df80a77e0da87ec66691e8f012f887b4a671ab5a"}, + {file = "ruff-0.6.8-py3-none-win_amd64.whl", hash = "sha256:ec0517dc0f37cad14a5319ba7bba6e7e339d03fbf967a6d69b0907d61be7a263"}, + {file = "ruff-0.6.8-py3-none-win_arm64.whl", hash = "sha256:8d3bb2e3fbb9875172119021a13eed38849e762499e3cfde9588e4b4d70968dc"}, + {file = "ruff-0.6.8.tar.gz", hash = "sha256:a5bf44b1aa0adaf6d9d20f86162b34f7c593bfedabc51239953e446aefc8ce18"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "setuptools" version = "75.1.0" @@ -2041,4 +2004,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c" +content-hash = "29f8d207debd76155da3db3331fa117832c969b27889645d3a65570c8692f47d" diff --git a/pyproject.toml b/pyproject.toml index a5e1b4d10..152f1a289 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ confz = "^2.0.1" pydantic-extra-types = "^2.9.0" pendulum = "^3.0.0" slimeface = "^2024.9.27" +ruff = "^0.6.8" [tool.poetry.scripts] javsp = "javsp.__main__:entry" @@ -42,7 +43,6 @@ priority = "primary" [tool.poetry.group.dev.dependencies] pytest = "^8.1.1" -flake8 = "^7.0.0" cx-freeze = "^7.2.2" types-lxml = "^2024.4.14" types-pillow = "^10.2.0.20240822" diff --git a/setup.py b/setup.py index 5d3aba2a8..81652452d 100644 --- a/setup.py +++ b/setup.py @@ -9,39 +9,34 @@ include_files: List[Tuple[str, str]] = [ - (f'{proj_root}/config.yml', 'config.yml'), - (f'{proj_root}/data', 'data'), - (f'{proj_root}/image', 'image') + (f"{proj_root}/config.yml", "config.yml"), + (f"{proj_root}/data", "data"), + (f"{proj_root}/image", "image"), ] includes = [] -for file in os.listdir('javsp/web'): +for file in os.listdir("javsp/web"): name, ext = os.path.splitext(file) - if ext == '.py': - includes.append('javsp.web.' + name) + if ext == ".py": + includes.append("javsp.web." + name) -packages = [ - 'pendulum' # pydantic_extra_types depends on pendulum +packages = [ + "pendulum" # pydantic_extra_types depends on pendulum ] build_exe = { - 'include_files': include_files, - 'includes': includes, - 'excludes': ['unittest'], - 'packages': packages, + "include_files": include_files, + "includes": includes, + "excludes": ["unittest"], + "packages": packages, } javsp = Executable( - './javsp/__main__.py', - target_name='JavSP', + "./javsp/__main__.py", + target_name="JavSP", base=base, - icon='./image/JavSP.ico', -) - -setup( - name='JavSP', - options = {'build_exe': build_exe}, - executables=[javsp] + icon="./image/JavSP.ico", ) +setup(name="JavSP", options={"build_exe": build_exe}, executables=[javsp]) diff --git a/tools/airav_search.py b/tools/airav_search.py index ca6aa95d8..678b1ab48 100644 --- a/tools/airav_search.py +++ b/tools/airav_search.py @@ -1,37 +1,38 @@ """获取airav指定关键词的所有搜索结果""" + import os import sys import json -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from javsp.web.base import Request request = Request() -request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' +request.headers["Accept-Language"] = "zh-TW,zh;q=0.9" -base_url = 'https://www.airav.wiki' +base_url = "https://www.airav.wiki" def search(keyword): """搜索指定影片的所有结果""" all_results = [] page = 1 - data = {'offset': 0, 'count': 1, 'result': []} - while (data['offset'] + len(data['result']) < data['count']): - url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={keyword}&page={page}' + data = {"offset": 0, "count": 1, "result": []} + while data["offset"] + len(data["result"]) < data["count"]: + url = f"{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={keyword}&page={page}" data = request.get(url).json() - all_results.extend(data['result']) + all_results.extend(data["result"]) print(f"Get page {page}: {len(data['result'])} movie(s)") page += 1 for i in all_results: - if not i['url']: - i['url'] = f"{base_url}/video/{i['barcode']}" + if not i["url"]: + i["url"] = f"{base_url}/video/{i['barcode']}" return all_results if __name__ == "__main__": - keyword = '版' + keyword = "版" results = search(keyword) - with open(f'airav_search_{keyword}.json', 'wt', encoding='utf-8') as f: + with open(f"airav_search_{keyword}.json", "wt", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False) diff --git a/tools/call_crawler.py 
b/tools/call_crawler.py index ed17b4ba2..da4315a19 100644 --- a/tools/call_crawler.py +++ b/tools/call_crawler.py @@ -1,4 +1,5 @@ """调用抓取器抓取数据""" + import os import sys @@ -11,21 +12,21 @@ file_dir = os.path.dirname(__file__) -data_dir = os.path.abspath(os.path.join(file_dir, '../unittest/data')) -sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..'))) +data_dir = os.path.abspath(os.path.join(file_dir, "../unittest/data")) +sys.path.insert(0, os.path.abspath(os.path.join(file_dir, ".."))) from javsp.datatype import MovieInfo # 搜索抓取器并导入它们 all_crawler = {} -exclude_files = ['fc2fan'] -for file in os.listdir('web'): +exclude_files = ["fc2fan"] +for file in os.listdir("web"): name, ext = os.path.splitext(file) - if ext == '.py' and name not in exclude_files: - modu = 'web.' + name + if ext == ".py" and name not in exclude_files: + modu = "web." + name __import__(modu) - if hasattr(sys.modules[modu], 'parse_data'): - parser = getattr(sys.modules[modu], 'parse_data') + if hasattr(sys.modules[modu], "parse_data"): + parser = getattr(sys.modules[modu], "parse_data") all_crawler[name] = parser @@ -38,18 +39,18 @@ def call_crawlers(dvdid_list: list, used_crawlers=None): crawlers (list[str], optional): 要使用的抓取器,未指定时将使用全部抓取器 """ if used_crawlers: - crawlers = {i:all_crawler[i] for i in used_crawlers} + crawlers = {i: all_crawler[i] for i in used_crawlers} else: crawlers = all_crawler - outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False) + outer_bar = tqdm(dvdid_list, desc="抓取影片数据", leave=False) for avid in outer_bar: success, fail = [], [] - outer_bar.set_description(f'抓取影片数据: {avid}') - inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False) + outer_bar.set_description(f"抓取影片数据: {avid}") + inner_bar = tqdm(crawlers.items(), desc="抓取器", leave=False) for name, parser in inner_bar: - inner_bar.set_description(f'正在抓取{name}'.rjust(10+len(avid))) + inner_bar.set_description(f"正在抓取{name}".rjust(10 + len(avid))) # 每次都会创建一个全新的实例,所以不同抓取器的结果之间不会有影响 - if name != 'fanza': + if name != "fanza": movie = MovieInfo(avid) else: movie = MovieInfo(cid=avid) @@ -60,7 +61,9 @@ def call_crawlers(dvdid_list: list, used_crawlers=None): success.append(name) except: fail.append(name) - out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(avid, len(success), ' '.join(success), len(fail), ' '.join(fail)) + out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format( + avid, len(success), " ".join(success), len(fail), " ".join(fail) + ) tqdm.write(out) @@ -69,16 +72,16 @@ def call_crawlers(dvdid_list: list, used_crawlers=None): # 带参数调用时,将参数全部视作番号并调用所有抓取器抓取数据 call_crawlers(sys.argv[1:]) else: - user_in = input('请输入要抓取数据的影片番号: ') + user_in = input("请输入要抓取数据的影片番号: ") dvdid_list = user_in.split() # 提示选择要使用的抓取器 names = list(all_crawler.keys()) for i in range(len(names)): - print(f"{i+1}. {names[i]}", end=' ') - user_in2 = input('\n请选择要使用的抓取器(回车表示全部使用): ') + print(f"{i+1}. 
{names[i]}", end=" ") + user_in2 = input("\n请选择要使用的抓取器(回车表示全部使用): ") if user_in2: items = user_in2.split() - indexes = [int(i)-1 for i in items if i.isdigit()] + indexes = [int(i) - 1 for i in items if i.isdigit()] valid_indexes = [i for i in indexes if i < len(names)] used = [names[i] for i in valid_indexes] else: diff --git a/tools/check_genre.py b/tools/check_genre.py index dd562dc65..f8357aa79 100644 --- a/tools/check_genre.py +++ b/tools/check_genre.py @@ -16,21 +16,25 @@ import csv -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from javsp.web.base import * from javsp.config import cfg def get_javbus_genre(): """获取JavBus的genre各语言对照列表""" - record = {} # {id: [cn_url, zh_tw, ja, en]} + record = {} # {id: [cn_url, zh_tw, ja, en]} base_url = cfg.ProxyFree.javbus subsite_urls = { - 'normal': ['/genre', '/ja/genre', '/en/genre'], - 'uncensored': ['/uncensored/genre', '/ja/uncensored/genre', '/en/uncensored/genre'], + "normal": ["/genre", "/ja/genre", "/en/genre"], + "uncensored": [ + "/uncensored/genre", + "/ja/uncensored/genre", + "/en/uncensored/genre", + ], } for subsite, urls in subsite_urls.items(): - id_prefix = 'uncensored-' if subsite == 'uncensored' else '' + id_prefix = "uncensored-" if subsite == "uncensored" else "" zh_tw = get_html(base_url + urls[0]) ja = get_html(base_url + urls[1]) en = get_html(base_url + urls[2]) @@ -38,8 +42,8 @@ def get_javbus_genre(): genre_tags = html.xpath("//div[@class='row genre-box']/a") # 提取各个genre的信息 for tag in genre_tags: - url = tag.get('href') - id = id_prefix + url.split('/')[-1] + url = tag.get("href") + id = id_prefix + url.split("/")[-1] name = tag.text.strip() if id in record: record[id].append(name) @@ -47,9 +51,9 @@ def get_javbus_genre(): record[id] = [url, name] # 将相关数据进行结构化后返回 data = { - 'site_name': 'javbus', - 'header': ['id', 'url', 'zh_tw', 'ja', 'en'], - 'record': record + "site_name": "javbus", + "header": ["id", "url", "zh_tw", "ja", "en"], + "record": record, } return data @@ -61,9 +65,9 @@ def get_javdb_genre(): record = {} base_url = cfg.ProxyFree.javdb subsite_urls = { - 'normal': ['/tags?locale=zh', '/tags?locale=en'], - 'uncensored': ['/tags/uncensored?locale=zh', '/tags/uncensored?locale=en'], - 'western': ['/tags/western?locale=zh', '/tags/western?locale=en'] + "normal": ["/tags?locale=zh", "/tags?locale=en"], + "uncensored": ["/tags/uncensored?locale=zh", "/tags/uncensored?locale=en"], + "western": ["/tags/western?locale=zh", "/tags/western?locale=en"], } for subsite, urls in subsite_urls.items(): zh_tw = get_html(base_url + urls[0]) @@ -72,8 +76,8 @@ def get_javdb_genre(): genre_tags = html.xpath("//span[@class='tag_labels']/a") # 提取各个genre的信息 for tag in genre_tags: - url = tag.get('href') - id = url.split('/')[-1] + url = tag.get("href") + id = url.split("/")[-1] name = tag.text.strip() if id in record: record[id].append(name) @@ -81,14 +85,14 @@ def get_javdb_genre(): record[id] = [url, name] # 移除分类中的c9:'筛选', c10:'年份', c11:'时长' for id, _ in record.copy().items(): - catelog = id.split('?')[1].split('=')[0] # e.g. tags?c11=2021 - if catelog in ['c9', 'c10', 'c11']: + catelog = id.split("?")[1].split("=")[0] # e.g. 
tags?c11=2021 + if catelog in ["c9", "c10", "c11"]: del record[id] # 将相关数据进行结构化后返回 data = { - 'site_name': 'javdb', - 'header': ['id', 'url', 'zh_tw', 'en'], - 'record': record + "site_name": "javdb", + "header": ["id", "url", "zh_tw", "en"], + "record": record, } return data @@ -97,22 +101,22 @@ def get_avsox_genre(): """获取AVSOX的genre各语言对照列表""" record = {} base_url = cfg.ProxyFree.avsox - languages = ['cn', 'tw', 'en', 'ja'] + languages = ["cn", "tw", "en", "ja"] for lang in languages: - html = get_html(f'{base_url}/{lang}/genre') + html = get_html(f"{base_url}/{lang}/genre") genre_tags = html.xpath("//div[@class='row genre-box']/a") for tag in genre_tags: - url = tag.get('href') - id = url.split('/')[-1] + url = tag.get("href") + id = url.split("/")[-1] name = tag.text.strip() if id in record: record[id].append(name) else: record[id] = [url, name] data = { - 'site_name': 'avsox', - 'header': ['id', 'url', 'zh_cn', 'zh_tw', 'en', 'ja'], - 'record': record + "site_name": "avsox", + "header": ["id", "url", "zh_cn", "zh_tw", "en", "ja"], + "record": record, } return data @@ -121,22 +125,22 @@ def get_javlib_genre(): """获取JavLibrary的genre各语言对照列表""" record = {} base_url = cfg.ProxyFree.javlib - languages = ['cn', 'tw', 'en', 'ja'] + languages = ["cn", "tw", "en", "ja"] for lang in languages: - html = get_html(f'{base_url}/{lang}/genres.php') + html = get_html(f"{base_url}/{lang}/genres.php") genre_tags = html.xpath("//div[@class='genreitem']/a") for tag in genre_tags: - url = tag.get('href') - id = url.split('=')[-1] + url = tag.get("href") + id = url.split("=")[-1] name = tag.text.strip() if id in record: record[id].append(name) else: record[id] = [url, name] data = { - 'site_name': 'javlib', - 'header': ['id', 'url', 'zh_cn', 'zh_tw', 'en', 'ja'], - 'record': record + "site_name": "javlib", + "header": ["id", "url", "zh_cn", "zh_tw", "en", "ja"], + "record": record, } return data @@ -144,12 +148,12 @@ def get_javlib_genre(): def write_csv(data): """将genre按照中文翻译排序后写入csv文件""" # data格式: {'site_name': name, 'header': ['id', 'url', 'zh_tw'...], 'record': {id1: [ls1], id2: [ls2]...}} - record = data['record'] + record = data["record"] csv_name = f"data/genre_{data['site_name']}.csv" - csv_header = data['header'] + ['translate', 'note'] + csv_header = data["header"] + ["translate", "note"] # p[1][1] 必须是最接近最终翻译文本的那一列(如繁体中文) sort_record = {k: v for k, v in sorted(record.items(), key=lambda p: p[1][1])} - with open(csv_name, 'wt', encoding='utf-8-sig', newline='') as csvfile: + with open(csv_name, "wt", encoding="utf-8-sig", newline="") as csvfile: writer = csv.writer(csvfile) writer.writerow(csv_header) for id, genres in sort_record.items(): diff --git a/tools/config_migration.py b/tools/config_migration.py index 95adc45d6..d1360d388 100644 --- a/tools/config_migration.py +++ b/tools/config_migration.py @@ -3,55 +3,68 @@ import re arg_parser = ArgumentParser( - prog='config migration', - description='migration your javsp config to yaml') + prog="config migration", description="migration your javsp config to yaml" +) -arg_parser.add_argument('-i', '--input', help='path to config.ini') -arg_parser.add_argument('-o', '--output', help='path to output config', default="config.yml") +arg_parser.add_argument("-i", "--input", help="path to config.ini") +arg_parser.add_argument( + "-o", "--output", help="path to output config", default="config.yml" +) args, _ = arg_parser.parse_known_args() -if(args.input is None): +if args.input is None: print("Expecting an input config file, try `config_migration.py -h` to 
see help.") exit(1) cfg = ConfigParser() cfg.read(args.input) -ignore_regexes: list[str] = cfg['MovieID']['ignore_regex'].split(';') -ignore_regexes += cfg['MovieID']['ignore_whole_word'].split(';') -ignore_regexes.append('(144|240|360|480|720|1080)[Pp]') -ignore_regexes.append('[24][Kk]') +ignore_regexes: list[str] = cfg["MovieID"]["ignore_regex"].split(";") +ignore_regexes += cfg["MovieID"]["ignore_whole_word"].split(";") +ignore_regexes.append("(144|240|360|480|720|1080)[Pp]") +ignore_regexes.append("[24][Kk]") -input_directory = cfg['File']['scan_dir'] -input_directory = 'null' if len(input_directory) == 0 else f"'{input_directory}'" +input_directory = cfg["File"]["scan_dir"] +input_directory = "null" if len(input_directory) == 0 else f"'{input_directory}'" -filename_extensions = cfg['File']['media_ext'].split(';') +filename_extensions = cfg["File"]["media_ext"].split(";") -ignored_folders = cfg['File']['ignore_folder'].split(';') +ignored_folders = cfg["File"]["ignore_folder"].split(";") + +proxy_disabled = cfg["Network"]["use_proxy"] == "no" or cfg["Network"]["proxy"] == "" -proxy_disabled = cfg['Network']['use_proxy'] == 'no' or cfg['Network']['proxy'] == '' def yes_to_true(s): - return 'true' if s == 'yes' else 'false' + return "true" if s == "yes" else "false" + def use_javdb_cover(s): - if s == 'yes': return 'no' - elif s == 'no': return 'yes' - elif s == 'auto': return 'fallback' + if s == "yes": + return "no" + elif s == "no": + return "yes" + elif s == "auto": + return "fallback" + def path_len_by_byte(s): - if s == 'no': return 'false' - else: return 'true' + if s == "no": + return "false" + else: + return "true" + def ai_crop_pat(s): - if s == r'\d': - return r'^\d{6}[-_]\d{3}$' + if s == r"\d": + return r"^\d{6}[-_]\d{3}$" else: - return '^' + s + return "^" + s + def fix_pat(p): - return re.sub(r'\$([a-z]+)', r'{\1}', p) + return re.sub(r"\$([a-z]+)", r"{\1}", p) + config_str = f"""# vim:foldmethod=marker ################################ @@ -242,6 +255,5 @@ def fix_pat(p): # 是否允许检查到新版本时自动下载 auto_update: {yes_to_true(cfg['Other']['auto_update'])}""" -with open(args.output, mode ="w") as file: +with open(args.output, mode="w") as file: file.write(config_str) - diff --git a/tools/version.py b/tools/version.py index b018e0a16..9e63bfaba 100644 --- a/tools/version.py +++ b/tools/version.py @@ -1,5 +1,5 @@ import importlib.metadata as meta -javsp_version = meta.version('javsp') +javsp_version = meta.version("javsp") print(javsp_version) diff --git a/unittest/conftest.py b/unittest/conftest.py index bfd973a8b..f46945597 100644 --- a/unittest/conftest.py +++ b/unittest/conftest.py @@ -4,7 +4,7 @@ from glob import glob -data_dir = os.path.join(os.path.dirname(__file__), 'data') +data_dir = os.path.join(os.path.dirname(__file__), "data") def pytest_addoption(parser): @@ -12,6 +12,7 @@ def pytest_addoption(parser): "--only", action="store", default="", help="仅测试指定抓取器的数据" ) + def pytest_runtest_logreport(report): """定制 short test summary info 显示格式""" # report 的部分属性形如 @@ -20,8 +21,8 @@ def pytest_runtest_logreport(report): # keywords: {'082713-417: avsox': 1, 'unittest/test_crawlers.py': 1, 'test_crawler[082713-417: avsox]': 1, 'JavSP': 1} # 为test_crawlers.py定制short test summary格式 - if 'test_crawlers.py::' in report.nodeid: - report.nodeid = re.sub(r'^.*::test_crawler', '', report.nodeid) + if "test_crawlers.py::" in report.nodeid: + report.nodeid = re.sub(r"^.*::test_crawler", "", report.nodeid) @pytest.fixture @@ -30,17 +31,17 @@ def crawler(request): def 
pytest_generate_tests(metafunc): - if 'crawler_params' in metafunc.fixturenames: + if "crawler_params" in metafunc.fixturenames: # 根据测试数据文件夹中的文件生成测试数据 testcases = {} - data_files = glob(data_dir + os.sep + '*.json') + data_files = glob(data_dir + os.sep + "*.json") target_crawler = metafunc.config.getoption("--only") for file in data_files: basename = os.path.basename(file) match = re.match(r"([-\w]+) \((\w+)\)", basename, re.I) if match: avid, scraper = match.groups() - name = f'{avid}: {scraper}' + name = f"{avid}: {scraper}" # 仅当未指定抓取器或者指定的抓取器与当前抓取器相同时,才实际执行抓取和比较 if (not target_crawler) or scraper == target_crawler: testcases[name] = (avid, scraper, file) diff --git a/unittest/test_avid.py b/unittest/test_avid.py index ca0c0008f..9ee5006b3 100644 --- a/unittest/test_avid.py +++ b/unittest/test_avid.py @@ -5,7 +5,7 @@ from shutil import rmtree file_dir = os.path.dirname(__file__) -sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(file_dir, ".."))) from javsp.avid import get_id, get_cid @@ -16,13 +16,13 @@ def prepare_files(files): Args: files (list of tuple): 文件列表,仅接受相对路径 """ - tmp_folder = 'tmp_' + uuid.uuid4().hex[:8] + tmp_folder = "tmp_" + uuid.uuid4().hex[:8] for i in files: path = os.path.join(tmp_folder, i) folder = os.path.split(path)[0] if folder and (not os.path.exists(folder)): os.makedirs(folder) - with open(path, 'wt', encoding='utf-8') as f: + with open(path, "wt", encoding="utf-8") as f: f.write(path) yield rmtree(tmp_folder) @@ -30,31 +30,31 @@ def prepare_files(files): def test_fc2(): - assert 'FC2-123456' == get_id('(2017) [FC2-123456] 【個人撮影】') - assert 'FC2-123456' == get_id('fc2-ppv-123456-1.delogo.mp4') - assert 'FC2-123456' == get_id('FC2-PPV-123456.mp4') - assert 'FC2-123456' == get_id('FC2PPV-123456 Yuukiy') - assert 'FC2-1234567' == get_id('fc2-ppv_1234567-2.mp4') + assert "FC2-123456" == get_id("(2017) [FC2-123456] 【個人撮影】") + assert "FC2-123456" == get_id("fc2-ppv-123456-1.delogo.mp4") + assert "FC2-123456" == get_id("FC2-PPV-123456.mp4") + assert "FC2-123456" == get_id("FC2PPV-123456 Yuukiy") + assert "FC2-1234567" == get_id("fc2-ppv_1234567-2.mp4") def test_normal(): - assert '' == get_id('Yuukiy') - assert 'ABC-12' == get_id('ABC-12_01.mkv') - assert 'ABC-123' == get_id('Sky Angel Vol.6 月丘うさぎ(ABC-123).avi') - assert 'ABCD-123' == get_id('ABCD-123.mp4') + assert "" == get_id("Yuukiy") + assert "ABC-12" == get_id("ABC-12_01.mkv") + assert "ABC-123" == get_id("Sky Angel Vol.6 月丘うさぎ(ABC-123).avi") + assert "ABCD-123" == get_id("ABCD-123.mp4") def test_cid_valid(): - assert 'ab012st' == get_cid('ab012st') - assert 'ab012st' == get_cid('ab012st.mp4') - assert '123_0456' == get_cid('123_0456.mp4') - assert '123abc00045' == get_cid('123abc00045.mp4') - assert '403abcd56789' == get_cid('403abcd56789_1') - assert 'h_001abc00001' == get_cid('h_001abc00001.mp4') - assert '1234wvr00001rp' == get_cid('1234wvr00001rp.mp4') - assert '402abc_hello000089' == get_cid('402abc_hello000089.mp4') - assert 'h_826zizd021' == get_cid('h_826zizd021.mp4') - assert '403abcd56789' == get_cid('403abcd56789cd1.mp4') + assert "ab012st" == get_cid("ab012st") + assert "ab012st" == get_cid("ab012st.mp4") + assert "123_0456" == get_cid("123_0456.mp4") + assert "123abc00045" == get_cid("123abc00045.mp4") + assert "403abcd56789" == get_cid("403abcd56789_1") + assert "h_001abc00001" == get_cid("h_001abc00001.mp4") + assert "1234wvr00001rp" == get_cid("1234wvr00001rp.mp4") + assert "402abc_hello000089" == 
get_cid("402abc_hello000089.mp4") + assert "h_826zizd021" == get_cid("h_826zizd021.mp4") + assert "403abcd56789" == get_cid("403abcd56789cd1.mp4") def test_from_file(): @@ -62,50 +62,52 @@ def test_from_file(): write_back = False rewrite_lines = [] - datafile = os.path.join(file_dir, 'testdata_avid.txt') - with open(datafile, 'rt', encoding='utf-8') as f: + datafile = os.path.join(file_dir, "testdata_avid.txt") + with open(datafile, "rt", encoding="utf-8") as f: lines = f.readlines() for line_no, line in enumerate(lines, start=1): - items = line.strip('\r\n').split('\t') + items = line.strip("\r\n").split("\t") if len(items) == 2: (filename, avid), ignore = items, False else: filename, avid, ignore = items guess_id = get_id(filename) if write_back: - rewrite_lines.append(f'{filename}\t{guess_id}\n') + rewrite_lines.append(f"{filename}\t{guess_id}\n") continue if guess_id != avid: if ignore: print(f"Ignored: {guess_id} != {avid}\t'{filename}'") else: - assert guess_id == avid.upper(), f'AV ID not match at line {line_no}' + assert ( + guess_id == avid.upper() + ), f"AV ID not match at line {line_no}" if write_back: - with open(datafile, 'wt', encoding='utf-8') as f: + with open(datafile, "wt", encoding="utf-8") as f: f.writelines(rewrite_lines) def test_cid_invalid(): - assert '' == get_cid('hasUpperletter.mp4') - assert '' == get_cid('存在非ASCII字符.mp4') - assert '' == get_cid('has-dash.mp4') - assert '' == get_cid('403_abcd56789_fgh') - assert '' == get_cid('many_parts1234-12.mp4') - assert '' == get_cid('abc12.mp4') - assert '' == get_cid('ab012st/仅文件夹名称为cid.mp4') - assert '' == get_cid('123_0456st.mp4') + assert "" == get_cid("hasUpperletter.mp4") + assert "" == get_cid("存在非ASCII字符.mp4") + assert "" == get_cid("has-dash.mp4") + assert "" == get_cid("403_abcd56789_fgh") + assert "" == get_cid("many_parts1234-12.mp4") + assert "" == get_cid("abc12.mp4") + assert "" == get_cid("ab012st/仅文件夹名称为cid.mp4") + assert "" == get_cid("123_0456st.mp4") -@pytest.mark.parametrize('files', [('Unknown.mp4',)]) +@pytest.mark.parametrize("files", [("Unknown.mp4",)]) def test_by_folder_name1(prepare_files): - assert '' == get_id('Unknown.mp4') + assert "" == get_id("Unknown.mp4") -@pytest.mark.parametrize('files', [('FC2-123456/Unknown.mp4',)]) +@pytest.mark.parametrize("files", [("FC2-123456/Unknown.mp4",)]) def test_by_folder_name2(prepare_files): - assert 'FC2-123456' == get_id('FC2-123456/Unknown.mp4') + assert "FC2-123456" == get_id("FC2-123456/Unknown.mp4") -@pytest.mark.parametrize('files', [('ABC-123/CDF-456.mp4',)]) +@pytest.mark.parametrize("files", [("ABC-123/CDF-456.mp4",)]) def test_by_folder_name3(prepare_files): - assert 'CDF-456' == get_id('ABC-123/CDF-456.mp4') + assert "CDF-456" == get_id("ABC-123/CDF-456.mp4") diff --git a/unittest/test_crawlers.py b/unittest/test_crawlers.py index 3b0257e07..42f5cf4c8 100644 --- a/unittest/test_crawlers.py +++ b/unittest/test_crawlers.py @@ -6,8 +6,8 @@ file_dir = os.path.dirname(__file__) -data_dir = os.path.join(file_dir, 'data') -sys.path.insert(0, os.path.abspath(os.path.join(file_dir, '..'))) +data_dir = os.path.join(file_dir, "data") +sys.path.insert(0, os.path.abspath(os.path.join(file_dir, ".."))) from javsp.datatype import MovieInfo from javsp.web.exceptions import CrawlerError, SiteBlocked @@ -25,27 +25,30 @@ def test_crawler(crawler_params): compare(*crawler_params) except requests.exceptions.ReadTimeout: logger.warning(f"{site} 连接超时: {params}") - except Exception as e: - if os.getenv('GITHUB_ACTIONS') and (site in ['javdb', 'javlib', 'airav']): 
- logger.debug(f'检测到Github actions环境,已忽略测试失败项: {params}', exc_info=True) + except Exception: + if os.getenv("GITHUB_ACTIONS") and (site in ["javdb", "javlib", "airav"]): + logger.debug( + f"检测到Github actions环境,已忽略测试失败项: {params}", exc_info=True + ) else: raise + def compare(avid, scraper, file): """从本地的数据文件生成Movie实例,并与在线抓取到的数据进行比较""" local = MovieInfo(from_file=file) - if scraper != 'fanza': + if scraper != "fanza": online = MovieInfo(avid) else: online = MovieInfo(cid=avid) # 导入抓取器模块 - scraper_mod = 'javsp.web.' + scraper + scraper_mod = "javsp.web." + scraper __import__(scraper_mod) mod = sys.modules[scraper_mod] - if hasattr(mod, 'parse_clean_data'): - parse_data = getattr(mod, 'parse_clean_data') + if hasattr(mod, "parse_clean_data"): + parse_data = getattr(mod, "parse_clean_data") else: - parse_data = getattr(mod, 'parse_data') + parse_data = getattr(mod, "parse_data") try: parse_data(online) @@ -61,22 +64,24 @@ def compare(avid, scraper, file): online_vars = vars(online) for k, v in online_vars.items(): # 部分字段可能随时间变化,因此只要这些字段不是一方有值一方无值就行 - if k in ['score', 'magnet']: + if k in ["score", "magnet"]: assert bool(v) == bool(local_vars.get(k, None)) - elif k == 'preview_video' and scraper in ['airav', 'javdb']: + elif k == "preview_video" and scraper in ["airav", "javdb"]: assert bool(v) == bool(local_vars.get(k, None)) # JavBus采用免代理域名时图片地址也会是免代理域名,因此只比较path部分即可 - elif k == 'cover' and scraper == 'javbus': + elif k == "cover" and scraper == "javbus": assert urlsplit(v).path == urlsplit(local_vars.get(k, None)).path - elif k == 'actress_pics' and scraper == 'javbus': + elif k == "actress_pics" and scraper == "javbus": local_tmp = online_tmp = {} local_pics = local_vars.get(k) if local_pics: - local_tmp = {name: urlsplit(url).path for name, url in local_pics.items()} + local_tmp = { + name: urlsplit(url).path for name, url in local_pics.items() + } if v: online_tmp = {name: urlsplit(url).path for name, url in v.items()} assert local_tmp == online_tmp - elif k == 'preview_pics' and scraper == 'javbus': + elif k == "preview_pics" and scraper == "javbus": local_pics = local_vars.get(k) if local_pics: local_tmp = [urlsplit(i).path for i in local_pics] @@ -84,7 +89,7 @@ def compare(avid, scraper, file): online_tmp = [urlsplit(i).path for i in v] assert local_tmp == online_tmp # 对顺序没有要求的list型字段,比较时也应该忽略顺序信息 - elif k in ['genre', 'genre_id', 'genre_norm', 'actress']: + elif k in ["genre", "genre_id", "genre_norm", "actress"]: if isinstance(v, list): loc_v = local_vars.get(k) if loc_v is None: @@ -96,7 +101,7 @@ def compare(avid, scraper, file): assert v == local_vars.get(k, None) except AssertionError: # 本地运行时更新已有的测试数据,方便利用版本控制系统检查差异项 - if not os.getenv('GITHUB_ACTIONS'): + if not os.getenv("GITHUB_ACTIONS"): online.dump(file) raise except Exception as e: diff --git a/unittest/test_exe.py b/unittest/test_exe.py index 983d1ff67..3307d5f0c 100644 --- a/unittest/test_exe.py +++ b/unittest/test_exe.py @@ -8,25 +8,29 @@ def test_javsp_exe(): cwd = os.getcwd() - dist_dir = os.path.normpath(os.path.join(os.path.dirname(__file__) + '/../dist')) + dist_dir = os.path.normpath(os.path.join(os.path.dirname(__file__) + "/../dist")) os.chdir(dist_dir) size = 300 * 2**20 - tmp_folder = '.TMP_' + ''.join(random.choices(string.ascii_uppercase, k=6)) - FILE = '300MAAN-642.RIP.f4v' + tmp_folder = ".TMP_" + "".join(random.choices(string.ascii_uppercase, k=6)) + FILE = "300MAAN-642.RIP.f4v" try: os.system(f"fsutil file createnew {FILE} {size}") - r = subprocess.run(f"JavSP.exe --auto-exit --input . 
--output {tmp_folder}".split(), capture_output=True, encoding='utf-8') - print(r.stdout, r.stderr.encode().decode("unicode_escape"), sep='\n') + r = subprocess.run( + f"JavSP.exe --auto-exit --input . --output {tmp_folder}".split(), + capture_output=True, + encoding="utf-8", + ) + print(r.stdout, r.stderr.encode().decode("unicode_escape"), sep="\n") r.check_returncode() # Check generated files - files = glob(tmp_folder + '/**/*.*', recursive=True) - print('\n'.join(files)) + files = glob(tmp_folder + "/**/*.*", recursive=True) + print("\n".join(files)) # assert all('横宮七海' in i for i in files), "Actress name not found" - assert any(i.endswith('fanart.jpg') for i in files), "fanart not found" - assert any(i.endswith('poster.jpg') for i in files), "poster not found" - assert any(i.endswith('.f4v') for i in files), "video file not found" - assert any(i.endswith('.nfo') for i in files), "nfo file not found" + assert any(i.endswith("fanart.jpg") for i in files), "fanart not found" + assert any(i.endswith("poster.jpg") for i in files), "poster not found" + assert any(i.endswith(".f4v") for i in files), "video file not found" + assert any(i.endswith(".nfo") for i in files), "nfo file not found" finally: if os.path.exists(FILE): os.remove(FILE) diff --git a/unittest/test_file.py b/unittest/test_file.py index df83467e0..ae3a689da 100644 --- a/unittest/test_file.py +++ b/unittest/test_file.py @@ -6,17 +6,19 @@ from shutil import rmtree -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from javsp.file import scan_movies -tmp_folder = 'TMP_' + ''.join(random.choices(string.ascii_uppercase, k=6)) -DEFAULT_SIZE = 512*2**20 # 512 MiB +tmp_folder = "TMP_" + "".join(random.choices(string.ascii_uppercase, k=6)) +DEFAULT_SIZE = 512 * 2**20 # 512 MiB + def touch_file_size(path: str, size_bytes: int): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.seek(size_bytes - 1) - f.write(b'\0') + f.write(b"\0") + @pytest.fixture def prepare_files(files): @@ -26,7 +28,7 @@ def prepare_files(files): files (list of tuple): 文件列表,仅接受相对路径 """ if not isinstance(files, dict): - files = {i:DEFAULT_SIZE for i in files} + files = {i: DEFAULT_SIZE for i in files} for name, size in files.items(): path = os.path.join(tmp_folder, name) folder = os.path.split(path)[0] @@ -39,190 +41,234 @@ def prepare_files(files): # 根文件夹下的单个影片文件 -@pytest.mark.parametrize('files', [('ABC-123.mp4',)]) +@pytest.mark.parametrize("files", [("ABC-123.mp4",)]) def test_single_movie(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 1 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'ABC-123.mp4' + assert basenames[0] == "ABC-123.mp4" # 多个分片以数字排序: 012 -@pytest.mark.parametrize('files', [('ABC-123-0.mp4','ABC-123-1.mp4','ABC-123- 2.mp4')]) +@pytest.mark.parametrize( + "files", [("ABC-123-0.mp4", "ABC-123-1.mp4", "ABC-123- 2.mp4")] +) def test_scan_movies__012(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 3 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'ABC-123-0.mp4' - assert basenames[1] == 'ABC-123-1.mp4' - assert basenames[2] == 'ABC-123- 2.mp4' + assert basenames[0] == "ABC-123-0.mp4" + assert 
basenames[1] == "ABC-123-1.mp4" + assert basenames[2] == "ABC-123- 2.mp4" # 多个分片以数字排序: 123 -@pytest.mark.parametrize('files', [('ABC-123.1.mp4','ABC-123. 2.mp4','ABC-123.3.mp4')]) +@pytest.mark.parametrize( + "files", [("ABC-123.1.mp4", "ABC-123. 2.mp4", "ABC-123.3.mp4")] +) def test_scan_movies__123(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 3 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'ABC-123.1.mp4' - assert basenames[1] == 'ABC-123. 2.mp4' - assert basenames[2] == 'ABC-123.3.mp4' + assert basenames[0] == "ABC-123.1.mp4" + assert basenames[1] == "ABC-123. 2.mp4" + assert basenames[2] == "ABC-123.3.mp4" # 多个分片以字母排序 -@pytest.mark.parametrize('files', [('ABC-123-A.mp4','ABC-123-B.mp4','ABC-123- C .mp4')]) +@pytest.mark.parametrize( + "files", [("ABC-123-A.mp4", "ABC-123-B.mp4", "ABC-123- C .mp4")] +) def test_scan_movies__abc(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 3 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'ABC-123-A.mp4' - assert basenames[1] == 'ABC-123-B.mp4' - assert basenames[2] == 'ABC-123- C .mp4' + assert basenames[0] == "ABC-123-A.mp4" + assert basenames[1] == "ABC-123-B.mp4" + assert basenames[2] == "ABC-123- C .mp4" # 多个分片以.CDx编号 -@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','ABC-123.CD2 .mp4','ABC-123.CD3.mp4')]) +@pytest.mark.parametrize( + "files", [("ABC-123.CD1.mp4", "ABC-123.CD2 .mp4", "ABC-123.CD3.mp4")] +) def test_scan_movies__cdx(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 3 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'ABC-123.CD1.mp4' - assert basenames[1] == 'ABC-123.CD2 .mp4' - assert basenames[2] == 'ABC-123.CD3.mp4' + assert basenames[0] == "ABC-123.CD1.mp4" + assert basenames[1] == "ABC-123.CD2 .mp4" + assert basenames[2] == "ABC-123.CD3.mp4" -@pytest.mark.parametrize('files', [('abc123cd1.mp4','abc123cd2.mp4')]) +@pytest.mark.parametrize("files", [("abc123cd1.mp4", "abc123cd2.mp4")]) def test_scan_movies__cdx_without_delimeter(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 2 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'abc123cd1.mp4' - assert basenames[1] == 'abc123cd2.mp4' + assert basenames[0] == "abc123cd1.mp4" + assert basenames[1] == "abc123cd2.mp4" # 文件夹以番号命名,分片位于文件夹内且无番号信息 -@pytest.mark.parametrize('files', [('ABC-123/CD1.mp4','ABC-123/CD2 .mp4','ABC-123/CD3.mp4')]) +@pytest.mark.parametrize( + "files", [("ABC-123/CD1.mp4", "ABC-123/CD2 .mp4", "ABC-123/CD3.mp4")] +) def test_scan_movies__from_folder(prepare_files): movies = scan_movies(tmp_folder) assert len(movies) == 1 - assert movies[0].dvdid == 'ABC-123' + assert movies[0].dvdid == "ABC-123" assert len(movies[0].files) == 3 basenames = [os.path.basename(i) for i in movies[0].files] - assert basenames[0] == 'CD1.mp4' - assert basenames[1] == 'CD2 .mp4' - assert basenames[2] == 'CD3.mp4' + assert basenames[0] == "CD1.mp4" + assert basenames[1] == "CD2 .mp4" + assert basenames[2] == "CD3.mp4" # 
 # 分片以多位数字编号
-@pytest.mark.parametrize('files', [('ABC-123.01.mp4','ABC-123.02.mp4','ABC-123.03.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123.01.mp4", "ABC-123.02.mp4", "ABC-123.03.mp4")]
+)
 def test_scan_movies__0x123(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 1
-    assert movies[0].dvdid == 'ABC-123'
+    assert movies[0].dvdid == "ABC-123"
     assert len(movies[0].files) == 3
     basenames = [os.path.basename(i) for i in movies[0].files]
-    assert basenames[0] == 'ABC-123.01.mp4'
-    assert basenames[1] == 'ABC-123.02.mp4'
-    assert basenames[2] == 'ABC-123.03.mp4'
+    assert basenames[0] == "ABC-123.01.mp4"
+    assert basenames[1] == "ABC-123.02.mp4"
+    assert basenames[2] == "ABC-123.03.mp4"
 # 无效: 没有可以匹配到番号的文件
-@pytest.mark.parametrize('files', [('什么也没有.mp4',)])
+@pytest.mark.parametrize("files", [("什么也没有.mp4",)])
 def test_scan_movies__nothing(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效: 在CWD下没有可以匹配到番号的文件
-@pytest.mark.parametrize('files', [('什么也没有.mp4',)])
+@pytest.mark.parametrize("files", [("什么也没有.mp4",)])
 def test_scan_movies__nothing_in_cwd(prepare_files):
     cwd = os.getcwd()
     os.chdir(tmp_folder)
     try:
-        movies = scan_movies('.')
+        movies = scan_movies(".")
     finally:
         os.chdir(cwd)
     assert len(movies) == 0
 # 无效:多个分片命名杂乱
-@pytest.mark.parametrize('files', [('ABC-123-1.mp4','ABC-123-第2部分.mp4','ABC-123-3.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123-1.mp4", "ABC-123-第2部分.mp4", "ABC-123-3.mp4")]
+)
 def test_scan_movies__strange_names(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效:同一影片的分片和非分片混合
-@pytest.mark.parametrize('files', [('ABC-123.mp4','ABC-123-1.mp4','ABC-123-2.mp4')])
+@pytest.mark.parametrize("files", [("ABC-123.mp4", "ABC-123-1.mp4", "ABC-123-2.mp4")])
 def test_scan_movies__mix_slices(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效:多个分片位于不同文件夹
-@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','sub/ABC-123.CD2.mp4','ABC-123.CD3.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123.CD1.mp4", "sub/ABC-123.CD2.mp4", "ABC-123.CD3.mp4")]
+)
 def test_scan_movies__wrong_structure(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效:分片的起始编号不合法
-@pytest.mark.parametrize('files', [('ABC-123.CD2.mp4','ABC-123.CD3.mp4','ABC-123.CD4.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123.CD2.mp4", "ABC-123.CD3.mp4", "ABC-123.CD4.mp4")]
+)
 def test_scan_movies__wrong_initial_id(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效:分片的编号不连续
-@pytest.mark.parametrize('files', [('ABC-123.CD1.mp4','ABC-123.CD3.mp4','ABC-123.CD4.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123.CD1.mp4", "ABC-123.CD3.mp4", "ABC-123.CD4.mp4")]
+)
 def test_scan_movies__not_consecutive(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 无效:分片的编号重复
-@pytest.mark.parametrize('files', [('ABC-123-1.mp4','ABC-123-1 .mp4','ABC-123-3.mp4')])
+@pytest.mark.parametrize(
+    "files", [("ABC-123-1.mp4", "ABC-123-1 .mp4", "ABC-123-3.mp4")]
+)
 def test_scan_movies__duplicate_index(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 0
 # 混合有效和无效数据
-@pytest.mark.parametrize('files', [('DEF-456/movie.mp4', 'ABC-123.1.mp4','sub/ABC-123.2.mp4','ABC-123.3.mp4')])
+@pytest.mark.parametrize(
+    "files",
+    [("DEF-456/movie.mp4", "ABC-123.1.mp4", "sub/ABC-123.2.mp4", "ABC-123.3.mp4")],
+)
 def test_scan_movies__mix_data(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 1
-    assert movies[0].dvdid == 'DEF-456'
+    assert movies[0].dvdid == "DEF-456"
     assert len(movies[0].files) == 1
     basenames = [os.path.basename(i) for i in movies[0].files]
-    assert basenames[0] == 'movie.mp4'
+    assert basenames[0] == "movie.mp4"
 # 文件夹以番号命名,文件夹内同时有带番号的影片和广告
-@pytest.mark.parametrize('files', [{'ABC-123/ABC-123.mp4': DEFAULT_SIZE, 'ABC-123/广告1.mp4': 1024, 'ABC-123/广告2.mp4': 243269631}])
+@pytest.mark.parametrize(
+    "files",
+    [
+        {
+            "ABC-123/ABC-123.mp4": DEFAULT_SIZE,
+            "ABC-123/广告1.mp4": 1024,
+            "ABC-123/广告2.mp4": 243269631,
+        }
+    ],
+)
 def test_scan_movies__1_video_with_ad(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 1
-    assert movies[0].dvdid == 'ABC-123'
+    assert movies[0].dvdid == "ABC-123"
     assert len(movies[0].files) == 1
 # 文件夹内同时有多部带番号的影片和广告
-@pytest.mark.parametrize('files', [{'ABC-123.mp4': DEFAULT_SIZE, 'DEF-456.mp4': DEFAULT_SIZE, '广告1.mp4': 1024, '广告2.mp4': 243269631}])
+@pytest.mark.parametrize(
+    "files",
+    [
+        {
+            "ABC-123.mp4": DEFAULT_SIZE,
+            "DEF-456.mp4": DEFAULT_SIZE,
+            "广告1.mp4": 1024,
+            "广告2.mp4": 243269631,
+        }
+    ],
+)
 def test_scan_movies__n_video_with_ad(prepare_files):
     movies = scan_movies(tmp_folder)
     assert len(movies) == 2
-    assert movies[0].dvdid == 'ABC-123' and movies[1].dvdid == 'DEF-456'
+    assert movies[0].dvdid == "ABC-123" and movies[1].dvdid == "DEF-456"
     assert all(len(i.files) == 1 for i in movies)
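The test_file.py cases above all lean on the same pytest pattern: the value supplied through @pytest.mark.parametrize("files", ...) is injected both into the test and into the prepare_files fixture, which lays the described files out on disk before the test runs. Only the creation half of the fixture is visible in the hunk, so the following is a minimal, self-contained sketch of that pattern; the TMP constant, the 1024-byte size, and the teardown via yield plus shutil.rmtree are illustrative assumptions rather than the project's exact code.

import os
import shutil

import pytest

TMP = "TMP_EXAMPLE"  # stand-in for the random tmp_folder used by the real tests


def touch_file_size(path: str, size_bytes: int) -> None:
    # Writing a single byte at offset size_bytes - 1 yields a (sparse) file of
    # the requested size without filling it with real data.
    with open(path, "wb") as f:
        f.seek(size_bytes - 1)
        f.write(b"\0")


@pytest.fixture
def prepare_files(files):
    # "files" comes from the test's parametrize marker; accept either a
    # sequence of names or a {name: size} mapping, mirroring the tests above.
    if not isinstance(files, dict):
        files = {name: 1024 for name in files}
    for name, size in files.items():
        path = os.path.join(TMP, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        touch_file_size(path, size)
    yield
    shutil.rmtree(TMP)


@pytest.mark.parametrize("files", [("ABC-123/CD1.mp4", "ABC-123/CD2.mp4")])
def test_layout_created(prepare_files, files):
    assert all(os.path.isfile(os.path.join(TMP, name)) for name in files)

Keeping the fixture and the parametrize marker in the same module is what lets every case above describe its disk layout in a single decorator line, while the sparse-file trick keeps even the 512 MiB fixtures cheap to create.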
diff --git a/unittest/test_func.py b/unittest/test_func.py
index ca6d0560f..ec8328b35 100644
--- a/unittest/test_func.py
+++ b/unittest/test_func.py
@@ -2,16 +2,16 @@
 import sys
 import random
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.func import *
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from javsp.func import *
 def test_remove_trail_actor_in_title():
     run = remove_trail_actor_in_title
-    delimiters = list('-xX &·,; &・,;')
-    title1 = '东风夜放花千树,更吹落、星如雨。'
-    title2 = '辛弃疾 ' + title1
-    names = ['辛弃疾', '牛顿', '爱因斯坦', '阿基米德', '伽利略']
+    delimiters = list("-xX &·,; &・,;")
+    title1 = "东风夜放花千树,更吹落、星如雨。"
+    title2 = "辛弃疾 " + title1
+    names = ["辛弃疾", "牛顿", "爱因斯坦", "阿基米德", "伽利略"]
     def combine(items):
         sep = random.choice(delimiters)
@@ -20,7 +20,7 @@ def combine(items):
         return new_str
     # 定义测试用例
-    assert title1 == run(combine([title1, '辛弃疾']), names)
+    assert title1 == run(combine([title1, "辛弃疾"]), names)
     assert title1 == run(combine([title1] + names), names)
-    assert title1 == run(combine([title1, '辛弃疾']), names)
+    assert title1 == run(combine([title1, "辛弃疾"]), names)
     assert title2 == run(combine([title2] + names), names)
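remove_trail_actor_in_title is only exercised indirectly above, through the combine helper whose middle falls outside the hunks. Judging from the visible assertions, the intent is that a title with actor names appended through any delimiter from "-xX &·,; &・,;" comes back as the bare title. A hedged illustration of that behaviour, assuming the repository is importable and fixing the delimiter to "-" instead of a random choice:

from javsp.func import remove_trail_actor_in_title

title = "东风夜放花千树,更吹落、星如雨。"
names = ["辛弃疾", "牛顿", "爱因斯坦", "阿基米德", "伽利略"]

# Single trailing actor name, as in the first assertion of the test.
assert remove_trail_actor_in_title(title + "-" + "辛弃疾", names) == title
# All actor names trailing the title, as in the second assertion.
assert remove_trail_actor_in_title(title + "-" + "-".join(names), names) == title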
diff --git a/unittest/test_lib.py b/unittest/test_lib.py
index 43a05338c..adff36e73 100644
--- a/unittest/test_lib.py
+++ b/unittest/test_lib.py
@@ -1,26 +1,26 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.lib import *
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from javsp.lib import *
 def test_detect_special_attr():
     run = detect_special_attr
     # 定义测试用例
-    assert run('STARS-225_UNCENSORED_LEAKED.mp4') == 'U'
-    assert run('STARS-225_UNCENSORED_LEAKED-C.mp4') == 'UC'
-    assert run('STARS-225_无码.mp4') == ''
-    assert run('STARS-225_无码流出.mp4') == 'U'
-    assert run('STARS-225_无码破解.mp4') == 'U'
-    assert run('STARS-225_UNCEN.mp4') == 'U'
-    assert run('STARS-225_UNCEN-C.mp4') == 'UC'
-    assert run('STARS-225u.mp4', 'STARS-225') == 'U'
-    assert run('STARS-225C.mp4', 'STARS-225') == 'C'
-    assert run('STARS-225uC.mp4', 'STARS-225') == 'UC'
-    assert run('STARS225u.mp4', 'STARS-225') == 'U'
-    assert run('STARS225C.mp4', 'STARS-225') == 'C'
-    assert run('STARS225uC.mp4', 'STARS-225') == 'UC'
-    assert run('STARS-225CD1.mp4', 'STARS-225') == ''
-    assert run('stars225cd2.mp4', 'STARS-225') == ''
+    assert run("STARS-225_UNCENSORED_LEAKED.mp4") == "U"
+    assert run("STARS-225_UNCENSORED_LEAKED-C.mp4") == "UC"
+    assert run("STARS-225_无码.mp4") == ""
+    assert run("STARS-225_无码流出.mp4") == "U"
+    assert run("STARS-225_无码破解.mp4") == "U"
+    assert run("STARS-225_UNCEN.mp4") == "U"
+    assert run("STARS-225_UNCEN-C.mp4") == "UC"
+    assert run("STARS-225u.mp4", "STARS-225") == "U"
+    assert run("STARS-225C.mp4", "STARS-225") == "C"
+    assert run("STARS-225uC.mp4", "STARS-225") == "UC"
+    assert run("STARS225u.mp4", "STARS-225") == "U"
+    assert run("STARS225C.mp4", "STARS-225") == "C"
+    assert run("STARS225uC.mp4", "STARS-225") == "UC"
+    assert run("STARS-225CD1.mp4", "STARS-225") == ""
+    assert run("stars225cd2.mp4", "STARS-225") == ""
diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py
index 1537d93ad..4be1152ca 100644
--- a/unittest/test_proxyfree.py
+++ b/unittest/test_proxyfree.py
@@ -1,18 +1,19 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from javsp.web.proxyfree import *
 def test_get_url():
-    assert get_proxy_free_url('javlib') != ''
-    assert get_proxy_free_url('javdb') != ''
+    assert get_proxy_free_url("javlib") != ""
+    assert get_proxy_free_url("javdb") != ""
 def test_get_url_with_prefer():
-    prefer_url = 'https://www.baidu.com'
-    assert prefer_url == get_proxy_free_url('javlib', prefer_url)
+    prefer_url = "https://www.baidu.com"
+    assert prefer_url == get_proxy_free_url("javlib", prefer_url)
+
 if __name__ == "__main__":
-    print(get_proxy_free_url('javlib'))
+    print(get_proxy_free_url("javlib"))
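Taken together, the two test_proxyfree.py cases pin down the contract of get_proxy_free_url: a known site id yields a non-empty URL, and a caller-supplied prefer_url (here a reachable dummy) is returned unchanged. A short illustration of that contract follows; it needs network access and the repository root on sys.path, and the first printed value naturally varies between runs.

from javsp.web.proxyfree import get_proxy_free_url

print(get_proxy_free_url("javlib"))                           # expected: some non-empty mirror URL
print(get_proxy_free_url("javlib", "https://www.baidu.com"))  # expected: the prefer_url itself

Each test module in this suite inserts the repository root into sys.path before importing javsp, so the files can also be run individually, for example with poetry run pytest unittest/test_proxyfree.py.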