From 8dbd23ef75efb4fdfaab91f8b1f6c4722d6b1f09 Mon Sep 17 00:00:00 2001
From: Satyam Kumar
Date: Sun, 1 Nov 2020 10:49:53 +0530
Subject: [PATCH 1/2] Fixes #160 - Fail properly on downloads

---
Review notes (text between "---" and the diffstat is not part of the commit):
- scraper.py: failed_urls is created as set() rather than {} because
  html_processor.py calls failed_urls.add(...); a dict has no add() method.
  The "index" blob ids for scraper.py in both patches predate this correction.
- failure_threshhold = 1 combined with the ">=" check in html_processor.py
  aborts on the first failed download -- TODO confirm this is intended.
- video.py uses logger but this patch adds no logger import there --
  TODO confirm video.py already defines logger.
- Context-line indentation was reconstructed from a whitespace-collapsed
  copy of this series -- verify against the target tree before applying.

 CHANGELOG.md                             |  1 +
 openedx2zim/html_processor.py            | 15 +++++++++++++--
 openedx2zim/scraper.py                   | 10 ++++++++++
 openedx2zim/xblocks_extractor/libcast.py | 11 ++++++++++-
 openedx2zim/xblocks_extractor/video.py   | 10 ++++++++--
 5 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fcde20e..c25419c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
 - fixed mobile navigation menu
 - fixed video encoding in low quality in WebM
 - added support for internationalization
+- Fail properly on too many failed downloads or critical failures
 
 # 1.0.0
 
diff --git a/openedx2zim/html_processor.py b/openedx2zim/html_processor.py
index d05c2c3..ba484e9 100644
--- a/openedx2zim/html_processor.py
+++ b/openedx2zim/html_processor.py
@@ -6,10 +6,13 @@ import lxml.html
 
 from bs4 import BeautifulSoup
 
-from .constants import DOWNLOADABLE_EXTENSIONS, AUDIO_FORMATS
+from .constants import DOWNLOADABLE_EXTENSIONS, AUDIO_FORMATS, getLogger
 from .utils import jinja, prepare_url, get_back_jumps, remove_autogenerated_tags
 
 
+logger = getLogger()
+
+
 class HtmlProcessor:
     def __init__(self, scraper):
         self.scraper = scraper
@@ -41,12 +44,20 @@ def download_and_get_filename(
             return None, None
         fresh_download = False
         if not output_file.exists():
+            file_url = prepare_url(src, netloc, path_on_server)
             if self.scraper.download_file(
-                prepare_url(src, netloc, path_on_server),
+                file_url,
                 output_file,
             ):
                 fresh_download = True
             else:
+                self.scraper.failed_urls.add(file_url)
+                if len(self.scraper.failed_urls) >= self.scraper.failure_threshhold:
+                    logger.error("Too many failures. Try running the scraper again")
+                    logger.debug("The following URLs failed to download:")
+                    for failed_url in self.scraper.failed_urls:
+                        logger.debug(failed_url)
+                    raise SystemExit(1)
                 return None, None
         return filename, fresh_download
 
diff --git a/openedx2zim/scraper.py b/openedx2zim/scraper.py
index e6081b4..f19b0fe 100644
--- a/openedx2zim/scraper.py
+++ b/openedx2zim/scraper.py
@@ -194,6 +194,9 @@ def __init__(
         self.wiki = None
         self.forum = None
 
+        self.failed_urls = set()
+        self.failure_threshhold = 1
+
         # set and record locale for translations
         locale_details = get_language_details(locale_name)
         self.instance_lang = locale_details["iso-639-1"]
@@ -884,4 +887,11 @@ def run(self):
             shutil.rmtree(self.build_dir, ignore_errors=True)
         # shutdown the youtube downloader
         self.yt_downloader.shutdown()
+        if self.failed_urls:
+            logger.error(
+                f"Content from {len(self.failed_urls)} URL(s) failed to download"
+            )
+            logger.debug("Failed URLs:")
+            for failed_url in self.failed_urls:
+                logger.debug(failed_url)
         logger.info("Done everything")
diff --git a/openedx2zim/xblocks_extractor/libcast.py b/openedx2zim/xblocks_extractor/libcast.py
index 0c0e4d8..8a0f888 100644
--- a/openedx2zim/xblocks_extractor/libcast.py
+++ b/openedx2zim/xblocks_extractor/libcast.py
@@ -2,6 +2,10 @@
 
 from .base_xblock import BaseXblock
 from ..utils import jinja, download_and_convert_subtitles, prepare_url, get_back_jumps
+from ..constants import getLogger
+
+
+logger = getLogger()
 
 
 class Libcast(BaseXblock):
@@ -44,9 +48,14 @@ def download(self, instance_connection):
             else:
                 video_path = self.output_path.joinpath("video.mp4")
             if not video_path.exists():
-                self.scraper.download_file(
+                downloaded = self.scraper.download_file(
                     prepare_url(url, self.scraper.instance_url), video_path
                 )
+                if not downloaded:
+                    logger.error(
+                        f"Video for libcast block {self.xblock_json['student_view_url']} failed to download"
+                    )
+                    raise SystemExit(1)
 
     def render(self):
         return jinja(
diff --git a/openedx2zim/xblocks_extractor/video.py b/openedx2zim/xblocks_extractor/video.py
index 907e355..d81e01c 100644
--- a/openedx2zim/xblocks_extractor/video.py
+++ b/openedx2zim/xblocks_extractor/video.py
@@ -114,13 +114,19 @@ def download(self, instance_connection):
             else:
                 video_path = self.output_path.joinpath("video.mp4")
             if not video_path.exists():
+                downloaded = None
                 if youtube:
-                    self.scraper.download_file(url, video_path)
+                    downloaded = self.scraper.download_file(url, video_path)
                 else:
-                    self.scraper.download_file(
+                    downloaded = self.scraper.download_file(
                         prepare_url(urllib.parse.unquote(url), self.scraper.instance_url),
                         video_path,
                     )
+                if not downloaded:
+                    logger.error(
+                        f"Video for video block {self.xblock_json['student_view_url']} failed to download"
+                    )
+                    raise SystemExit(1)
             real_subtitle = download_and_convert_subtitles(
                 self.output_path, subs_lang, instance_connection
             )

From 0335f07aa19419632b542a025e97be8086ac7da5 Mon Sep 17 00:00:00 2001
From: Satyam Kumar
Date: Sun, 1 Nov 2020 11:07:40 +0530
Subject: [PATCH 2/2] Fix MP4 convertion if correct file is downloaded

---
 openedx2zim/scraper.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/openedx2zim/scraper.py b/openedx2zim/scraper.py
index f19b0fe..53fda79 100644
--- a/openedx2zim/scraper.py
+++ b/openedx2zim/scraper.py
@@ -663,6 +663,10 @@ def convert_video(self, src, dst):
             preset = VideoWebmLow() if self.video_format == "webm" else VideoMp4Low()
         elif src.suffix[1:] != self.video_format:
             preset = VideoWebmHigh() if self.video_format == "webm" else VideoMp4High()
+        else:
+            if not src.resolve() == dst.resolve():
+                shutil.move(src, dst)
+            return True
         return reencode(
             src,
             dst,