diff --git a/README.md b/README.md index 590783f..994f1ee 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A downloader tool for kemono.party and coomer.party. 3. Then install requirements with `pip install -r requirements.txt` - If the command doesn't run try adding `python -m`, `python3 -m`, or `py -m` to the front 4. Get a cookie.txt file from kemono.party/coomer.party - - You can get a cookie text file on [Chrome](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid?hl=en) with this extension. + - You can get a cookie text file on [Chrome](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) or [Firefox](https://addons.mozilla.org/firefox/addon/cookies-txt/) with this extension. - A cookie.txt file is required to use downloader! 5. Run `python kemono-dl.py --cookies "cookie.txt" --links https://kemono.party/SERVICE/user/USERID` - If the script doesn't run try replacing `python` with `python3` or `py` @@ -26,13 +26,13 @@ Takes in a url or list of urls separated by a comma. `--from-file FILE` Reads in a file with urls separated by new lines. Lines starting with # will not be read in. `--kemono-fav-users SERVICE` -Downloads favorite users from kemono.party of specified type or types separated by a comma. Types include: all, patreon, fanbox, gumroad, subscribestar, dlsite, fantia. Your cookie file must have been gotten while logged in to work. +Downloads favorite users from kemono.party/su of specified type or types separated by a comma. Types include: all, patreon, fanbox, gumroad, subscribestar, dlsite, fantia. Your cookie file must have been gotten while logged in to work. `--coomer-fav-users SERVICE` -Downloads favorite users from coomer.party of specified type or types separated by a comma. Types include: all, onlyfans. Your cookie file must have been gotten while logged in to work. +Downloads favorite users from coomer.party/su of specified type or types separated by a comma. Types include: all, onlyfans. Your cookie file must have been gotten while logged in to work. `--kemono-fav-posts` -Downloads favorite posts from kemono.party. Your cookie file must have been gotten while logged in to work. +Downloads favorite posts from kemono.party/su. Your cookie file must have been gotten while logged in to work. `--coomer-fav-posts` -Downloads favorite posts from coomer.party. Your cookie file must have been gotten while logged in to work. +Downloads favorite posts from coomer.party/su. Your cookie file must have been gotten while logged in to work. ## What files to download @@ -56,6 +56,8 @@ Download the users profile banner. Only works when a user url is passed. Try to download the post embed with yt-dlp. `--skip-attachments` Do not download post attachments. +`--skip-local-hash` +Do not verify the hash of already downloaded local files. `--overwrite` Overwrite any previously created files. @@ -121,6 +123,11 @@ The time in seconds to wait between downloading posts. (default: 0) The amount of times to retry / resume downloading a file. (default: 5) `--ratelimit-sleep SEC` The time in seconds to wait after being ratelimited (default: 120) +`--ratelimit-ms MS` +The minimum time in milliseconds to wait between requests (default: 300) + +`--proxy-agent https://agent/proxy` +Proxy agent URL. This is NOT a standard HTTP/HTTPS proxy; the target URL is passed to the agent in the 'u' query parameter. Not enabled by default. When this is enabled you cannot download from kemono and coomer at the same time.
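For illustration only (not part of the PR), a minimal sketch of how the target URL reaches the proxy agent: `RefererSession.get` in `src/helper.py` appends it as the `u` query parameter of the agent URL. The agent address and the helper name below are invented for the example.

```python
# Sketch of the URL wrapping done by RefererSession.get (src/helper.py).
# "https://example.com/agent" is a placeholder agent, not a real service.
from urllib.parse import urlparse, urlunparse, urlencode, parse_qsl

def wrap_with_agent(agent_url: str, target_url: str) -> str:
    parts = urlparse(agent_url)
    params = dict(parse_qsl(parts.query))
    params['u'] = target_url  # the downloader passes the real URL in the 'u' parameter
    return urlunparse(parts._replace(query=urlencode(params)))

print(wrap_with_agent("https://example.com/agent", "https://kemono.party/api/v1/creators.txt"))
# https://example.com/agent?u=https%3A%2F%2Fkemono.party%2Fapi%2Fv1%2Fcreators.txt
```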
# Notes - Excepted link formats: diff --git a/src/api_test.py b/src/api_test.py new file mode 100644 index 0000000..9ac122e --- /dev/null +++ b/src/api_test.py @@ -0,0 +1,78 @@ +import unittest +import requests +from numbers import Number +from requests.adapters import HTTPAdapter, Retry + +class ApiTest(unittest.TestCase): + site = '' + timeout = 5 + headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'} + def getSession(self): + retries = Retry( + total = 3, + backoff_factor = 0.1, + status_forcelist = [ 500, 502, 503, 504 ] + ) + session = requests.Session() + session.mount('https://', HTTPAdapter(max_retries=retries)) + session.mount('http://', HTTPAdapter(max_retries=retries)) + return session + def callApi(self, url): + print(f'Request {url}') + response = self.getSession().get(url=url, headers=self.headers, timeout=self.timeout) + if response.status_code == 401: + raise Exception(f'HTTP 401 bad cookie | {response.status_code} {response.reason}') + elif not response.ok: + raise Exception(f'HTTP not ok | {response.status_code} {response.reason}') + print(f'Response {response.text}') + return response.json() + +class KemonoApiTest(ApiTest, unittest.TestCase): + site = 'kemono.party' + patreonUser = '35150295' + patreonPost = '65210116' + + def test_creators(self): + print('Start test for creators') + creators = self.callApi(url=f'https://{self.site}/api/creators/') + self.assertGreaterEqual(len(creators), 1, 'creators can not be empty') + creator = creators[0] + self.assertTrue(isinstance(creator['favorited'], Number), 'favorited must be number') + self.assertTrue(isinstance(creator['indexed'], Number), 'indexed must be number') + self.assertTrue(isinstance(creator['updated'], Number), 'updated must be number') + self.assertTrue(isinstance(creator['id'], str), 'id must be str') + self.assertTrue(isinstance(creator['name'], str), 'name must be str') + self.assertTrue(isinstance(creator['service'], str), 'service must be str') + + def test_patreon_post(self): + print('Start test for Patreon post api') + post = self.callApi(url=f"https://{self.site}/api/patreon/user/{self.patreonUser}/post/{self.patreonPost}") + self.assertEqual(len(post), 1, 'Post list length must be 1') + post = post[0] + self.assertEqual(post['added'], 'Thu, 28 Apr 2022 03:16:21 GMT', 'added not equal') + self.assertEqual(post['attachments'], [{'name': 'Nelves_Moonwell_Final.jpg', + 'path': '/59/ca/59ca91127d30cd44c85a8fd71a7a560b74c4eb7e0a2873065057fe20f7e3c5b8.jpg'}], + 'attachment not equal') + self.assertEqual(post['content'], "

Made a quick render of night elves \"bathing\" \ +in a moonwell while waiting for simulations on the Miss Fortune animation. I'm in the polishing stage of the Miss \ +Fortune animation and will be hiring a VA soon as well as beginning the render!

Hope you all enjoy this scene :)

", + 'content not equal') + self.assertEqual(post['edited'], 'Sat, 16 Apr 2022 14:03:04 GMT', 'edited not equal') + self.assertEqual(post['embed'], {}, 'embed not equal') + self.assertEqual(post['file'], {'name': 'Nelves_Moonwell_Final.jpg', + 'path': '/59/ca/59ca91127d30cd44c85a8fd71a7a560b74c4eb7e0a2873065057fe20f7e3c5b8.jpg'}, + 'file not equal') + self.assertEqual(post['id'], '65210116', 'post id not equal') + self.assertEqual(post['published'], 'Sat, 16 Apr 2022 14:03:04 GMT', 'published not equal') + self.assertEqual(post['title'], 'Moonwell Bathing and Miss Fortune Update', 'title not equal') + self.assertEqual(post['user'], self.patreonUser, 'user must be same') + self.assertEqual(post['service'], 'patreon', 'service must be patreon') + self.assertFalse(post['shared_file'], 'shared file must be false in this post') + + +class CoomerApiTest(ApiTest, unittest.TestCase): + site = 'coomer.party' + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/src/args.py b/src/args.py index 8fc8cc6..6bed199 100644 --- a/src/args.py +++ b/src/args.py @@ -3,6 +3,7 @@ import re import argparse from http.cookiejar import MozillaCookieJar, LoadError +from urllib.parse import urlparse, urlunparse from .version import __version__ @@ -26,19 +27,19 @@ def get_args(): ap.add_argument("--kemono-fav-users", metavar="SERVICE", type=str, default=None, - help="Downloads favorite users from kemono.party of specified type or types separated by a comma. Types include: all, patreon, fanbox, gumroad, subscribestar, dlsite, fantia. Your cookie file must have been gotten while logged in to work.") + help="Downloads favorite users from kemono.party/su of specified type or types separated by a comma. Types include: all, patreon, fanbox, gumroad, subscribestar, dlsite, fantia. Your cookie file must have been gotten while logged in to work.") ap.add_argument("--coomer-fav-users", metavar="SERVICE", type=str, default=None, - help="Downloads favorite users from coomer.party of specified type or types separated by a comma. Types include: all, onlyfans. Your cookie file must have been gotten while logged in to work.") + help="Downloads favorite users from coomer.party/su of specified type or types separated by a comma. Types include: all, onlyfans. Your cookie file must have been gotten while logged in to work.") ap.add_argument("--kemono-fav-posts", action='store_true', default=False, - help="Downloads favorite posts from kemono.party. Your cookie file must have been gotten while logged in to work.") + help="Downloads favorite posts from kemono.party/su. Your cookie file must have been gotten while logged in to work.") ap.add_argument("--coomer-fav-posts", action='store_true', default=False, - help="Downloads favorite posts from coomer.party. Your cookie file must have been gotten while logged in to work.") + help="Downloads favorite posts from coomer.party/su. 
Your cookie file must have been gotten while logged in to work.") @@ -82,6 +83,10 @@ def get_args(): action='store_true', default=False, help="Do not download post attachments.") + ap.add_argument("--skip-local-hash", + action='store_true', default=False, + help="Do not verify the hash of already downloaded local files.") + ap.add_argument("--overwrite", action='store_true', default=False, help="Overwrite any previously created files.") @@ -196,20 +201,31 @@ def get_args(): metavar="SEC", type=int, default=120, help="The time in seconds to wait after being ratelimited (default: 120)") + ap.add_argument("--ratelimit-ms", + metavar="MS", type=int, default=300, + help="The minimum time in milliseconds to wait between requests (default: 300)") + ap.add_argument("--user-agent", metavar="UA", type=str, default='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36', help="Set a custom user agent") + ap.add_argument("--proxy-agent", + metavar="https://agent/proxy", type=str, default=None, + help="Proxy agent URL. This is NOT a standard HTTP/HTTPS proxy; the target URL is passed to the agent in the 'u' query parameter. Not enabled by default. " + "When this is enabled you cannot download from kemono and coomer at the same time.") + args = vars(ap.parse_args()) + args['cookie_domains'] = {'kemono': None, 'coomer': None} # takes a comma seperated lost of cookie files and loads them into a cookie jar if args['cookies']: cookie_files = [s.strip() for s in args["cookies"].split(",")] args['cookies'] = MozillaCookieJar() + loaded_cookies = MozillaCookieJar() loaded = 0 for cookie_file in cookie_files: try: - args['cookies'].load(cookie_file) + loaded_cookies.load(cookie_file) loaded += 1 except LoadError: print(F"Unable to load cookie {cookie_file}") @@ -217,7 +233,26 @@ def get_args(): print(F"Unable to find cookie {cookie_file}") if loaded == 0: print("No cookies loaded | exiting"), exit() - + # make sure cookies are wildcard for better compatibility + for cookie in loaded_cookies: + args['cookie_domains']['kemono'] = args['cookie_domains']['kemono'] or ( + match := re.search(r'^(?:www)?\.?(kemono\.(?:party|su))$', cookie.domain)) and match.group(1) + args['cookie_domains']['coomer'] = args['cookie_domains']['coomer'] or ( + match := re.search(r'^(?:www)?\.?(coomer\.(?:party|su))$', cookie.domain)) and match.group(1) + + if cookie.domain.startswith('www.'): + cookie.domain = cookie.domain[3:] + cookie.domain_specified = True + cookie.domain_initial_dot = True + elif not cookie.domain.startswith('.'): + cookie.domain = f'.{cookie.domain}' + cookie.domain_specified = True + cookie.domain_initial_dot = True + args['cookies'].set_cookie(cookie) + + if (not args['cookie_domains']['kemono'] and (args['kemono_fav_users'] or args['kemono_fav_posts'])) or ( + not args['cookie_domains']['coomer'] and (args['coomer_fav_users'] or args['coomer_fav_posts'])): + print(f"Bad cookie file | Unable to detect domain when downloading favorites"), exit() # takes a comma seperated string of links and converts them to a list if args['links']: args['links'] = [s.strip().split('?')[0] for s in args["links"].split(",")] @@ -306,4 +341,21 @@ def check_size(args, key): print(f"--coomer-fav-users no valid options were passed") args['coomer_fav_users'] = temp + if args['proxy_agent']: + u = urlparse(args['proxy_agent']) + if not u.netloc or not u.path: + print(f"Bad proxy agent url | URL should be something like https://example.com/agent"), exit() + if not u.scheme: + u = u._replace(scheme='http') + args['proxy_agent'] = urlunparse(u) + + # we should 
change cookie domain to proxy agent + new_cookies = MozillaCookieJar() + for cookie in args['cookies']: + cookie.domain = f'.{u.netloc}' + cookie.domain_specified = True + cookie.domain_initial_dot = True + new_cookies.set_cookie(cookie) + args['cookies'] = new_cookies + return args \ No newline at end of file diff --git a/src/helper.py b/src/helper.py index b2e06df..8d15368 100644 --- a/src/helper.py +++ b/src/helper.py @@ -2,10 +2,17 @@ import hashlib import os import time +import requests +from urllib.parse import urlparse, urlencode, parse_qs, urlunparse + +from .args import get_args +from .logger import logger + +running_args = get_args() def parse_url(url): # parse urls - downloadable = re.search(r'^https://(kemono\.party|coomer\.party)/([^/]+)/user/([^/]+)($|/post/([^/]+)$)',url) + downloadable = re.search(r'^https://((?:kemono|coomer)\.(?:party|su))/([^/]+)/user/([^/]+)($|/post/([^/]+)$)',url) if not downloadable: return None return downloadable.group(1) @@ -13,9 +20,11 @@ def parse_url(url): # create path from template pattern def compile_post_path(post_variables, template, ascii): drive, tail = os.path.splitdrive(template) - tail = tail[1:] if tail[0] in {'/','\\'} else tail + tail_trimmed = tail[0] in {'/','\\'} + tail = tail[1:] if tail_trimmed else tail tail_split = re.split(r'\\|/', tail) - cleaned_path = drive + os.path.sep if drive else '' + cleaned_path = (drive + os.path.sep if drive else + (os.path.sep if tail_trimmed else '')) for folder in tail_split: if ascii: cleaned_path = os.path.join(cleaned_path, restrict_ascii(clean_folder_name(folder.format(**post_variables)))) @@ -87,7 +96,7 @@ def print_download_bar(total:int, downloaded:int, resumed:int, start): rate = (downloaded-resumed)/time_diff - eta = time.strftime("%H:%M:%S", time.gmtime((total-downloaded) / rate)) + eta = time.strftime("%H:%M:%S", time.gmtime((total-downloaded) / rate)) if rate else '99:99:99' if rate/2**10 < 100: rate = (round(rate/2**10, 1), 'KB') @@ -141,4 +150,57 @@ def print_download_bar(total:int, downloaded:int, resumed:int, start): # latest_version = datetime.datetime.strptime(latest_tag, r'%Y.%m.%d.%H') # if current_version < latest_version: # logger.debug(f"Using kemono-dl {__version__} while latest release is kemono-dl {latest_tag}") -# logger.warning(f"A newer version of kemono-dl is available. Please update to the latest release at https://github.com/AplhaSlayer1964/kemono-dl/releases/latest") \ No newline at end of file +# logger.warning(f"A newer version of kemono-dl is available. 
Please update to the latest release at https://github.com/AplhaSlayer1964/kemono-dl/releases/latest") + + +# doesn't support multithreading +def function_rate_limit(func): + last_call_times = {} + + def wrapper(*args, **kwargs): + nonlocal last_call_times + func_name = func.__name__ + t = time.time() + last_call_time = last_call_times.get(func_name, 0) + if (t - last_call_time) * 1000 < running_args['ratelimit_ms']: + time.sleep(running_args['ratelimit_ms'] / 1000 - (t - last_call_time)) + last_call_times[func_name] = time.time() + return func(*args, **kwargs) + + return wrapper + +class RefererSession(requests.Session): + def __init__(self, *args, **kwargs): + self.proxy_agent = kwargs.pop('proxy_agent', None) + self.max_retries_429 = kwargs.pop('max_retries_429', 3) + self.sleep_429 = kwargs.pop('sleep_429', 120) + + super().__init__(*args, **kwargs) + + def rebuild_auth(self, prepared_request, response): + super().rebuild_auth(prepared_request, response) + u = urlparse(response.url) + prepared_request.headers["Referer"] = f'{u.scheme}://{u.netloc}/' + + @function_rate_limit + def get(self, url, **kwargs): + old_url = url + retry_429 = kwargs.pop('retry_429', True) + max_retries_429 = kwargs.pop('max_retries_429', self.max_retries_429) + + if self.proxy_agent: + u = urlparse(self.proxy_agent) + q_params = parse_qs(u.query) + q_params['u'] = url + u = u._replace(query=urlencode(q_params)) + url = urlunparse(u) + + resp = super().get(url, **kwargs) + max_retries_429 -= 1 + if resp.status_code != 429 or not retry_429 or max_retries_429 < 1: + return resp + + # need retry + logger.warning(f"Failed to access: {url if self.proxy_agent else old_url} | {resp.status_code} Too Many Requests | Sleeping for {self.sleep_429} seconds") + time.sleep(self.sleep_429) + return self.get(old_url, retry_429=retry_429, max_retries_429=max_retries_429, **kwargs) \ No newline at end of file diff --git a/src/logger.py b/src/logger.py index 923c578..1df1c27 100644 --- a/src/logger.py +++ b/src/logger.py @@ -23,7 +23,7 @@ file_format = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s') stream_format = logging.Formatter('%(levelname)s:%(message)s') -file_handler = logging.FileHandler('debug.log', encoding="utf-16") +file_handler = logging.FileHandler('debug.log', encoding="utf-8") file_handler.setFormatter(file_format) stream_handler = logging.StreamHandler() diff --git a/src/main.py b/src/main.py index d46de17..f80ceda 100644 --- a/src/main.py +++ b/src/main.py @@ -2,17 +2,19 @@ from requests.adapters import HTTPAdapter, Retry import re import os +import math from bs4 import BeautifulSoup import time import datetime from PIL import Image from io import BytesIO import json +from numbers import Number from .args import get_args from .logger import logger from .version import __version__ -from .helper import get_file_hash, print_download_bar, check_date, parse_url, compile_post_path, compile_file_path +from .helper import get_file_hash, print_download_bar, check_date, parse_url, compile_post_path, compile_file_path, RefererSession from .my_yt_dlp import my_yt_dlp class downloader: @@ -44,6 +46,7 @@ def __init__(self, args): self.post_errors = 0 # controls what to download/save + self.local_hash = not args['skip_local_hash'] self.attachments = not args['skip_attachments'] self.inline = args['inline'] self.content = args['content'] @@ -80,10 +83,17 @@ def __init__(self, args): self.retry = args['retry'] self.no_part = args['no_part_files'] self.ratelimit_sleep = args['ratelimit_sleep'] + self.ratelimit_ms = 
args['ratelimit_ms'] self.post_timeout = args['post_timeout'] self.simulate = args['simulate'] + self.cookie_domains = args['cookie_domains'] + self.proxy_agent = args['proxy_agent'] - self.session = requests.Session() + self.session = RefererSession( + proxy_agent = self.proxy_agent, + max_retries_429 = self.retry, + sleep_429 = self.ratelimit_sleep + ) retries = Retry( total=self.retry, backoff_factor=0.1, @@ -96,7 +106,7 @@ def __init__(self, args): def get_creators(self, domain:str): # get site creators - creators_api = f"https://{domain}/api/creators/" + creators_api = f"https://{domain}/api/v1/creators.txt" logger.debug(f"Getting creator json from {creators_api}") return self.session.get(url=creators_api, cookies=self.cookies, headers=self.headers, timeout=self.timeout).json() @@ -107,7 +117,7 @@ def get_user(self, user_id:str, service:str): return None def get_favorites(self, domain:str, fav_type:str, services:list = None): - fav_api = f'https://{domain}/api/favorites?type={fav_type}' + fav_api = f'https://{domain}/api/v1/account/favorites?type={fav_type}' logger.debug(f"Getting favorite json from {fav_api}") response = self.session.get(url=fav_api, headers=self.headers, cookies=self.cookies, timeout=self.timeout) if response.status_code == 401: @@ -126,11 +136,11 @@ def get_favorites(self, domain:str, fav_type:str, services:list = None): self.get_post(f"https://{domain}/{favorite['service']}/user/{favorite['id']}") def get_post(self, url:str): - found = re.search(r'(https://(kemono\.party|coomer\.party)/)(([^/]+)/user/([^/]+)($|/post/[^/]+))', url) + found = re.search(r'(https://((?:kemono|coomer)\.(?:party|su))/)(([^/]+)/user/([^/]+)($|/post/[^/]+))', url) if not found: logger.error(f"Unable to find url parameters for {url}") return - api = f"{found.group(1)}api/{found.group(3)}" + api = f"{found.group(1)}api/v1/{found.group(3)}" site = found.group(2) service = found.group(4) user_id = found.group(5) @@ -142,13 +152,13 @@ def get_post(self, url:str): if not is_post: if self.skip_user(user): return - logger.info(f"Downloading posts from {site}.party | {service} | {user['name']} | {user['id']}") + logger.info(f"Downloading posts from {site} | {service} | {user['name']} | {user['id']}") chunk = 0 first = True while True: if is_post: logger.debug(f"Requesting post json from: {api}") - json = self.session.get(url=api, cookies=self.cookies, headers=self.headers, timeout=self.timeout).json() + json = [self.session.get(url=api, cookies=self.cookies, headers=self.headers, timeout=self.timeout).json()] else: logger.debug(f"Requesting user json from: {api}?o={chunk}") json = self.session.get(url=f"{api}?o={chunk}", cookies=self.cookies, headers=self.headers, timeout=self.timeout).json() @@ -176,19 +186,25 @@ except: logger.exception("Unable to download post | service:{service} user_id:{user_id} post_id:{id}".format(**post['post_variables'])) self.comp_posts.append("https://{site}/{service}/user/{user_id}/post/{id}".format(**post['post_variables'])) - if len(json) < 25: + # kemono and coomer share the same page offset now, which is 50 + # according to the api documentation, "stepping of 50 is enforced" + # nekohouse is also set to 50 now (but its api does not seem to work currently) + # however it still supports 25, just like kemono did before. 
+ # So it would be fine we fixed in 50 + chunk_size = 50 + if len(json) < chunk_size: return # completed - chunk += 25 + chunk += chunk_size def download_icon_banner(self, post:dict, img_types:list): for img_type in img_types: if post['post_variables']['service'] in {'dlsite'}: logger.warning(f"Profile {img_type}s are not supported for {post['post_variables']['service']} users") - return + continue if post['post_variables']['service'] in {'gumroad'} and img_type == 'banner': logger.warning(f"Profile {img_type}s are not supported for {post['post_variables']['service']} users") - return + continue image_url = "https://{site}/{img_type}s/{service}/{user_id}".format(img_type=img_type, **post['post_variables']) response = self.session.get(url=image_url,headers=self.headers, cookies=self.cookies, timeout=self.timeout) try: @@ -228,7 +244,8 @@ def write_dms(self, post:dict): def get_inline_images(self, post, content_soup): # only get images that are hosted by the .party site - inline_images = [inline_image for inline_image in content_soup.find_all("img") if inline_image['src'][0] == '/'] + inline_images = [inline_image for inline_image in content_soup.find_all("img") + if inline_image.get('src') and inline_image.get('src')[0] == '/'] for index, inline_image in enumerate(inline_images): file = {} filename, file_extension = os.path.splitext(inline_image['src'].rsplit('/')[-1]) @@ -242,8 +259,10 @@ def get_inline_images(self, post, content_soup): 'index': f"{index + 1}".zfill(len(str(len(inline_images)))) } file['file_path'] = compile_file_path(post['post_path'], post['post_variables'], file['file_variables'], self.inline_filename_template, self.restrict_ascii) + # get dir which stores html to calculate relative path + html_dir = os.path.split(compile_file_path(post['post_path'], post['post_variables'], file['file_variables'], self.other_filename_template, self.restrict_ascii))[0] # set local image location in html - inline_image['src'] = file['file_path'] + inline_image['src'] = os.path.relpath(file['file_path'], html_dir) post['inline_images'].append(file) return content_soup @@ -267,12 +286,12 @@ def get_comments(self, post_variables:dict): comment_soup = page_soup.find("div", {"class": "post__comments"}) no_comments = re.search('([^ ]+ does not support comment scraping yet\.|No comments found for this post\.)',comment_soup.text) if no_comments: - logger.debug(no_comments.group(1).strip()) + logger.debug(f"{no_comments.group(1).strip()} from {post_url}") return '' return comment_soup.prettify() except: self.post_errors += 1 - logger.exception("Failed to get post comments") + logger.exception(f"Failed to get post comments for {post_url}") def compile_post_content(self, post, content_soup, comment_soup, embed): post['content']['text'] = f"{content_soup}\n{embed}\n{comment_soup}" @@ -292,17 +311,18 @@ def clean_post(self, post:dict, user:dict, domain:str): new_post['post_variables']['username'] = user['name'] new_post['post_variables']['site'] = domain new_post['post_variables']['service'] = post['service'] - new_post['post_variables']['added'] = datetime.datetime.strptime(post['added'], r'%a, %d %b %Y %H:%M:%S %Z').strftime(self.date_strf_pattern) if post['added'] else None - new_post['post_variables']['updated'] = datetime.datetime.strptime(post['edited'], r'%a, %d %b %Y %H:%M:%S %Z').strftime(self.date_strf_pattern) if post['edited'] else None - new_post['post_variables']['user_updated'] = datetime.datetime.strptime(user['updated'], r'%a, %d %b %Y %H:%M:%S %Z').strftime(self.date_strf_pattern) if 
user['updated'] else None - new_post['post_variables']['published'] = datetime.datetime.strptime(post['published'], r'%a, %d %b %Y %H:%M:%S %Z').strftime(self.date_strf_pattern) if post['published'] else None + new_post['post_variables']['added'] = self.format_time_by_type(post['added']) if post['added'] else None + new_post['post_variables']['updated'] = self.format_time_by_type(post['edited']) if post['edited'] else None + new_post['post_variables']['user_updated'] = self.format_time_by_type(user['updated']) if user['updated'] else None + new_post['post_variables']['published'] = self.format_time_by_type(post['published']) if post['published'] else None new_post['post_path'] = compile_post_path(new_post['post_variables'], self.download_path_template, self.restrict_ascii) new_post['attachments'] = [] if self.attachments: # add post file to front of attachments list if it doesn't already exist - if post['file'] and not post['file'] in post['attachments']: + # sometimes post['file'] is an empty structure + if post['file'] and not post['file'] in post['attachments'] and post['file'].get('name'): post['attachments'].insert(0, post['file']) # loop over attachments and set file variables for index, attachment in enumerate(post['attachments']): @@ -378,7 +398,7 @@ def write_content(self, post:dict): self.write_to_file(post['content']['file_path'], post['content']['text']) except: self.post_errors += 1 - logger.exception(f"Failed to save content") + logger.exception(f"Failed to save content to {post['content']['file_path']}") def write_links(self, post:dict): # Write post content links @@ -387,7 +407,7 @@ def write_links(self, post:dict): self.write_to_file(post['links']['file_path'], post['links']['text']) except: self.post_errors += 1 - logger.exception(f"Failed to save content links") + logger.exception(f"Failed to save content links to {post['links']['file_path']}") def write_json(self, post:dict): try: @@ -400,7 +420,7 @@ def write_json(self, post:dict): self.write_to_file(file_path, post) except: self.post_errors += 1 - logger.exception(f"Failed to save json") + logger.exception(f"Failed to save json to {file_path}") def write_to_file(self, file_path, file_content): # check if file exists and if should overwrite @@ -419,7 +439,7 @@ def write_to_file(self, file_path, file_content): json.dump(file_content, f, indent=4, sort_keys=True) else: with open(file_path,'wb') as f: - f.write(file_content.encode("utf-16")) + f.write(file_content.encode("utf-8")) def download_file(self, file:dict, retry:int): # download a file @@ -471,16 +491,16 @@ def download_file(self, file:dict, retry:int): os.rename(part_file, file['file_path']) return logger.error("Incorrect amount of bytes downloaded | Something went so wrong I have no idea what happened | Removing file") - os.remove(part_file) + # attempt to keep this file + filepath = os.path.splitext(file['file_path']) + filepath = filepath[0] + '_statuscode416' + filepath[1] + # assume broken file, replace directly + os.replace(part_file, filepath) self.post_errors += 1 return if response.status_code == 429: - logger.warning(f"Failed to download: {os.path.split(file['file_path'])[1]} | 429 Too Many Requests | Sleeping for {self.ratelimit_sleep} seconds") - time.sleep(self.ratelimit_sleep) - if retry > 0: - self.download_file(file, retry=retry-1) - return + # already retried for 429 logger.error(f"Failed to download: {os.path.split(file['file_path'])[1]} | 429 Too Many Requests | All retries failed") self.post_errors += 1 return @@ -496,9 +516,10 @@ def 
download_file(self, file:dict, retry:int): if not self.simulate: if not os.path.exists(os.path.split(file['file_path'])[0]): os.makedirs(os.path.split(file['file_path'])[0]) - with open(part_file, 'ab') as f: + with open(part_file, 'wb' if resume_size == 0 else 'ab') as f: start = time.time() downloaded = resume_size + print_download_bar(total, downloaded, resume_size, start) for chunk in response.iter_content(chunk_size=1024*1024): downloaded += len(chunk) f.write(chunk) @@ -510,13 +531,23 @@ def download_file(self, file:dict, retry:int): logger.debug(f"Local File hash: {local_hash}") logger.debug(f"Sever File hash: {file['file_variables']['hash']}") if local_hash != file['file_variables']['hash']: - logger.warning(f"File hash did not match server! | Retrying") - if retry > 0: - self.download_file(file, retry=retry-1) + if file['file_variables']['hash'] != None: + # we have hash + logger.warning(f"File hash did not match server! | Retrying") + os.remove(part_file) + if retry > 0: + self.download_file(file, retry=retry-1) + return + logger.error(f"File hash did not match server! | All retries failed") + self.post_errors += 1 + else: + # no hash provided + logger.warning(f"No file hash from server! | Save file with suffix in name") + filepath = os.path.splitext(file['file_path']) + filepath = filepath[0] + '_noserverhash' + filepath[1] + # assume broken file, replace directly + os.replace(part_file, filepath) return - logger.error(f"File hash did not match server! | All retries failed") - self.post_errors += 1 - return # remove .part from file name if self.overwrite: os.replace(part_file, file['file_path']) @@ -543,28 +574,29 @@ def write_archive(self, post:dict): def skip_user(self, user:dict): # check last update date if self.user_up_datebefore or self.user_up_dateafter: - if check_date(datetime.datetime.strptime(user['updated'], r'%a, %d %b %Y %H:%M:%S %Z'), None, self.user_up_datebefore, self.user_up_dateafter): - logger.info("Skipping user | user updated date not in range") + if check_date(self.get_date_by_type(user['updated']), None, self.user_up_datebefore, self.user_up_dateafter): + logger.info(f"Skipping user {user['name']} | user updated date not in range") return True return False def skip_post(self, post:dict): + post_title = post['post_variables']['title'] # check if the post should be downloaded if self.archive_file: if "https://{site}/{service}/user/{user_id}/post/{id}".format(**post['post_variables']) in self.archive_list: - logger.info("Skipping post | post already archived") + logger.info(f"Skipping post {post_title} | post already archived") return True if self.date or self.datebefore or self.dateafter: if not post['post_variables']['published']: - logger.info("Skipping post | post published date not in range") + logger.info(f"Skipping post {post_title} | post published date not in range") return True - elif check_date(datetime.datetime.strptime(post['post_variables']['published'], self.date_strf_pattern), self.date, self.datebefore, self.dateafter): - logger.info("Skipping post | post published date not in range") + elif check_date(self.get_date_by_type(post['post_variables']['published'], self.date_strf_pattern), self.date, self.datebefore, self.dateafter): + logger.info(f"Skipping post {post_title} | post published date not in range") return True if "https://{site}/{service}/user/{user_id}/post/{id}".format(**post['post_variables']) in self.comp_posts: - logger.info("Skipping post | post was already downloaded this session") + logger.info(f"Skipping post {post_title} | 
post was already downloaded this session") return True return False @@ -573,7 +605,16 @@ def skip_file(self, file:dict): # check if file exists if not self.overwrite: if os.path.exists(file['file_path']): - logger.info(f"Skipping: {os.path.split(file['file_path'])[1]} | File already exists") + confirm_msg = '' + if self.local_hash and 'hash' in file['file_variables'] and file['file_variables']['hash'] != None: + local_hash = get_file_hash(file['file_path']) + if local_hash != file['file_variables']['hash']: + logger.warning(f"Corrupted file detected, remove this file and try to redownload | path: {file['file_path']} " + + f"local hash: {local_hash} server hash: {file['file_variables']['hash']}") + os.remove(file['file_path']) + return False + confirm_msg = ' hash confirmed' + logger.info(f"Skipping: {os.path.split(file['file_path'])[1]} | File already exists{confirm_msg}") return True # check file name extention @@ -620,15 +661,17 @@ def start_download(self): if not domain: logger.warning(f"URL is not downloadable | {url}") continue + if domain not in self.cookie_domains.values(): + logger.warning(f"Domain not in cookie files, cookie won't work properly | {url}") urls.append(url) if not domain in domains: domains.append(domain) if self.k_fav_posts or self.k_fav_users: - if not 'kemono.party' in domains: - domains.append('kemono.party') + if self.cookie_domains['kemono'] not in domains: + domains.append(self.cookie_domains['kemono']) if self.c_fav_posts or self.c_fav_users: - if not 'coomer.party' in domains: - domains.append('coomer.party') + if self.cookie_domains['coomer'] not in domains: + domains.append(self.cookie_domains['coomer']) for domain in domains: try: @@ -639,26 +682,27 @@ def start_download(self): logger.error("No creator information was retrieved. 
| exiting") exit() + # TODO retry not implemented if self.k_fav_posts: try: - self.get_favorites('kemono.party', 'post', retry=self.retry) + self.get_favorites(self.cookie_domains['kemono'], 'post', retry=self.retry) except: - logger.exception("Unable to get favorite posts from kemono.party") + logger.exception(f"Unable to get favorite posts from {self.cookie_domains['kemono']}") if self.c_fav_posts: try: - self.get_favorites('coomer.party', 'post') + self.get_favorites(self.cookie_domains['coomer'], 'post', retry=self.retry) except: - logger.exception("Unable to get favorite posts from coomer.party") + logger.exception(f"Unable to get favorite posts from {self.cookie_domains['coomer']}") if self.k_fav_users: try: - self.get_favorites('kemono.party', 'artist', self.k_fav_users) + self.get_favorites(self.cookie_domains['kemono'], 'artist', self.k_fav_users) except: - logger.exception("Unable to get favorite users from kemono.party") + logger.exception(f"Unable to get favorite users from {self.cookie_domains['kemono']}") if self.c_fav_users: try: - self.get_favorites('coomer.party', 'artist', self.c_fav_users) + self.get_favorites(self.cookie_domains['coomer'], 'artist', self.c_fav_users) except: - logger.exception("Unable to get favorite users from coomer.party") + logger.exception(f"Unable to get favorite users from {self.cookie_domains['coomer']}") for url in urls: try: @@ -666,5 +710,20 @@ def start_download(self): except: logger.exception(f"Unable to get posts for {url}") + def get_date_by_type(self, time, date_format = None): + if isinstance(time, Number): + t = datetime.datetime.fromtimestamp(time) + elif isinstance(time, str): + t = datetime.datetime.fromisoformat(time) if date_format is None else datetime.datetime.strptime(time, date_format) + elif time is None: + return None + else: + raise Exception(f'Can not format time {time}') + return t + + def format_time_by_type(self, time): + t = self.get_date_by_type(time) + return t.strftime(self.date_strf_pattern) if t != None else t + def main(): downloader(get_args())