-
Notifications
You must be signed in to change notification settings - Fork 10k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
…loses #21185, closes #26711, closes #27068, closes #27930, closes #28198, closes #28199, closes #28274) * Generalize unique video ids for zdf based extractors * Improve extraction * Fix 3sat and phoenix
- Loading branch information
Showing
3 changed files
with
276 additions
and
285 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,193 +1,43 @@ | ||
from __future__ import unicode_literals | ||
|
||
import re | ||
from .zdf import ZDFIE | ||
|
||
from .common import InfoExtractor | ||
from ..utils import ( | ||
int_or_none, | ||
unified_strdate, | ||
xpath_text, | ||
determine_ext, | ||
float_or_none, | ||
ExtractorError, | ||
) | ||
|
||
|
||
class DreiSatIE(InfoExtractor): | ||
class DreiSatIE(ZDFIE): | ||
IE_NAME = '3sat' | ||
_GEO_COUNTRIES = ['DE'] | ||
_VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' | ||
_TESTS = [ | ||
{ | ||
'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', | ||
'md5': 'be37228896d30a88f315b638900a026e', | ||
'info_dict': { | ||
'id': '45918', | ||
'ext': 'mp4', | ||
'title': 'Waidmannsheil', | ||
'description': 'md5:cce00ca1d70e21425e72c86a98a56817', | ||
'uploader': 'SCHWEIZWEIT', | ||
'uploader_id': '100000210', | ||
'upload_date': '20140913' | ||
}, | ||
'params': { | ||
'skip_download': True, # m3u8 downloads | ||
} | ||
_VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' | ||
_TESTS = [{ | ||
# Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html | ||
'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', | ||
'md5': '0aff3e7bc72c8813f5e0fae333316a1d', | ||
'info_dict': { | ||
'id': '141007_ab18_10wochensommer_film', | ||
'ext': 'mp4', | ||
'title': 'Ab 18! - 10 Wochen Sommer', | ||
'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', | ||
'duration': 2660, | ||
'timestamp': 1608604200, | ||
'upload_date': '20201222', | ||
}, | ||
{ | ||
'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', | ||
'only_matching': True, | ||
}, { | ||
'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', | ||
'info_dict': { | ||
'id': '140913_sendung_schweizweit', | ||
'ext': 'mp4', | ||
'title': 'Waidmannsheil', | ||
'description': 'md5:cce00ca1d70e21425e72c86a98a56817', | ||
'timestamp': 1410623100, | ||
'upload_date': '20140913' | ||
}, | ||
] | ||
|
||
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): | ||
param_groups = {} | ||
for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): | ||
group_id = param_group.get(self._xpath_ns( | ||
'id', 'http://www.w3.org/XML/1998/namespace')) | ||
params = {} | ||
for param in param_group: | ||
params[param.get('name')] = param.get('value') | ||
param_groups[group_id] = params | ||
|
||
formats = [] | ||
for video in smil.findall(self._xpath_ns('.//video', namespace)): | ||
src = video.get('src') | ||
if not src: | ||
continue | ||
bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) | ||
group_id = video.get('paramGroup') | ||
param_group = param_groups[group_id] | ||
for proto in param_group['protocols'].split(','): | ||
formats.append({ | ||
'url': '%s://%s' % (proto, param_group['host']), | ||
'app': param_group['app'], | ||
'play_path': src, | ||
'ext': 'flv', | ||
'format_id': '%s-%d' % (proto, bitrate), | ||
'tbr': bitrate, | ||
}) | ||
self._sort_formats(formats) | ||
return formats | ||
|
||
def extract_from_xml_url(self, video_id, xml_url): | ||
doc = self._download_xml( | ||
xml_url, video_id, | ||
note='Downloading video info', | ||
errnote='Failed to download video info') | ||
|
||
status_code = xpath_text(doc, './status/statuscode') | ||
if status_code and status_code != 'ok': | ||
if status_code == 'notVisibleAnymore': | ||
message = 'Video %s is not available' % video_id | ||
else: | ||
message = '%s returned error: %s' % (self.IE_NAME, status_code) | ||
raise ExtractorError(message, expected=True) | ||
|
||
title = xpath_text(doc, './/information/title', 'title', True) | ||
|
||
urls = [] | ||
formats = [] | ||
for fnode in doc.findall('.//formitaeten/formitaet'): | ||
video_url = xpath_text(fnode, 'url') | ||
if not video_url or video_url in urls: | ||
continue | ||
urls.append(video_url) | ||
|
||
is_available = 'http://www.metafilegenerator' not in video_url | ||
geoloced = 'static_geoloced_online' in video_url | ||
if not is_available or geoloced: | ||
continue | ||
|
||
format_id = fnode.attrib['basetype'] | ||
format_m = re.match(r'''(?x) | ||
(?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ | ||
(?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) | ||
''', format_id) | ||
|
||
ext = determine_ext(video_url, None) or format_m.group('container') | ||
|
||
if ext == 'meta': | ||
continue | ||
elif ext == 'smil': | ||
formats.extend(self._extract_smil_formats( | ||
video_url, video_id, fatal=False)) | ||
elif ext == 'm3u8': | ||
# the certificates are misconfigured (see | ||
# https://github.com/ytdl-org/youtube-dl/issues/8665) | ||
if video_url.startswith('https://'): | ||
continue | ||
formats.extend(self._extract_m3u8_formats( | ||
video_url, video_id, 'mp4', 'm3u8_native', | ||
m3u8_id=format_id, fatal=False)) | ||
elif ext == 'f4m': | ||
formats.extend(self._extract_f4m_formats( | ||
video_url, video_id, f4m_id=format_id, fatal=False)) | ||
else: | ||
quality = xpath_text(fnode, './quality') | ||
if quality: | ||
format_id += '-' + quality | ||
|
||
abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) | ||
vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) | ||
|
||
tbr = int_or_none(self._search_regex( | ||
r'_(\d+)k', video_url, 'bitrate', None)) | ||
if tbr and vbr and not abr: | ||
abr = tbr - vbr | ||
|
||
formats.append({ | ||
'format_id': format_id, | ||
'url': video_url, | ||
'ext': ext, | ||
'acodec': format_m.group('acodec'), | ||
'vcodec': format_m.group('vcodec'), | ||
'abr': abr, | ||
'vbr': vbr, | ||
'tbr': tbr, | ||
'width': int_or_none(xpath_text(fnode, './width')), | ||
'height': int_or_none(xpath_text(fnode, './height')), | ||
'filesize': int_or_none(xpath_text(fnode, './filesize')), | ||
'protocol': format_m.group('proto').lower(), | ||
}) | ||
|
||
geolocation = xpath_text(doc, './/details/geolocation') | ||
if not formats and geolocation and geolocation != 'none': | ||
self.raise_geo_restricted(countries=self._GEO_COUNTRIES) | ||
|
||
self._sort_formats(formats) | ||
|
||
thumbnails = [] | ||
for node in doc.findall('.//teaserimages/teaserimage'): | ||
thumbnail_url = node.text | ||
if not thumbnail_url: | ||
continue | ||
thumbnail = { | ||
'url': thumbnail_url, | ||
} | ||
thumbnail_key = node.get('key') | ||
if thumbnail_key: | ||
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) | ||
if m: | ||
thumbnail['width'] = int(m.group(1)) | ||
thumbnail['height'] = int(m.group(2)) | ||
thumbnails.append(thumbnail) | ||
|
||
upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) | ||
|
||
return { | ||
'id': video_id, | ||
'title': title, | ||
'description': xpath_text(doc, './/information/detail'), | ||
'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), | ||
'thumbnails': thumbnails, | ||
'uploader': xpath_text(doc, './/details/originChannelTitle'), | ||
'uploader_id': xpath_text(doc, './/details/originChannelId'), | ||
'upload_date': upload_date, | ||
'formats': formats, | ||
'params': { | ||
'skip_download': True, | ||
} | ||
|
||
def _real_extract(self, url): | ||
video_id = self._match_id(url) | ||
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id | ||
return self.extract_from_xml_url(video_id, details_url) | ||
}, { | ||
# Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html | ||
'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', | ||
'only_matching': True, | ||
}, { | ||
# Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids | ||
'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', | ||
'only_matching': True, | ||
}] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,128 @@ | ||
# coding: utf-8 | ||
from __future__ import unicode_literals | ||
|
||
from .dreisat import DreiSatIE | ||
import re | ||
|
||
from .youtube import YoutubeIE | ||
from .zdf import ZDFBaseIE | ||
from ..compat import compat_str | ||
from ..utils import ( | ||
int_or_none, | ||
merge_dicts, | ||
unified_timestamp, | ||
xpath_text, | ||
) | ||
|
||
class PhoenixIE(DreiSatIE): | ||
|
||
class PhoenixIE(ZDFBaseIE): | ||
IE_NAME = 'phoenix.de' | ||
_VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ | ||
(?: | ||
phoenix/die_sendungen/(?:[^/]+/)? | ||
)? | ||
(?P<id>[0-9]+)''' | ||
_TESTS = [ | ||
{ | ||
'url': 'http://www.phoenix.de/content/884301', | ||
'md5': 'ed249f045256150c92e72dbb70eadec6', | ||
'info_dict': { | ||
'id': '884301', | ||
'ext': 'mp4', | ||
'title': 'Michael Krons mit Hans-Werner Sinn', | ||
'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', | ||
'upload_date': '20141025', | ||
'uploader': 'Im Dialog', | ||
} | ||
_VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html' | ||
_TESTS = [{ | ||
# Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html | ||
'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', | ||
'md5': '34ec321e7eb34231fd88616c65c92db0', | ||
'info_dict': { | ||
'id': '210222_phx_nachgehakt_corona_protest', | ||
'ext': 'mp4', | ||
'title': 'Wohin führt der Protest in der Pandemie?', | ||
'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', | ||
'duration': 1691, | ||
'timestamp': 1613906100, | ||
'upload_date': '20210221', | ||
'uploader': 'Phoenix', | ||
'channel': 'corona nachgehakt', | ||
}, | ||
{ | ||
'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', | ||
'only_matching': True, | ||
}, { | ||
# Youtube embed | ||
'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', | ||
'info_dict': { | ||
'id': 'hMQtqFYjomk', | ||
'ext': 'mp4', | ||
'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', | ||
'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', | ||
'duration': 3509, | ||
'upload_date': '20201219', | ||
'uploader': 'phoenix', | ||
'uploader_id': 'phoenix', | ||
}, | ||
{ | ||
'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', | ||
'only_matching': True, | ||
'params': { | ||
'skip_download': True, | ||
}, | ||
] | ||
}, { | ||
'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', | ||
'only_matching': True, | ||
}, { | ||
# no media | ||
'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', | ||
'only_matching': True, | ||
}, { | ||
# Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html | ||
'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', | ||
'only_matching': True, | ||
}] | ||
|
||
def _real_extract(self, url): | ||
video_id = self._match_id(url) | ||
webpage = self._download_webpage(url, video_id) | ||
article_id = self._match_id(url) | ||
|
||
article = self._download_json( | ||
'https://www.phoenix.de/response/id/%s' % article_id, article_id, | ||
'Downloading article JSON') | ||
|
||
video = article['absaetze'][0] | ||
title = video.get('titel') or article.get('subtitel') | ||
|
||
if video.get('typ') == 'video-youtube': | ||
video_id = video['id'] | ||
return self.url_result( | ||
video_id, ie=YoutubeIE.ie_key(), video_id=video_id, | ||
video_title=title) | ||
|
||
video_id = compat_str(video.get('basename') or video.get('content')) | ||
|
||
internal_id = self._search_regex( | ||
r'<div class="phx_vod" id="phx_vod_([0-9]+)"', | ||
webpage, 'internal video ID') | ||
details = self._download_xml( | ||
'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php', | ||
video_id, 'Downloading details XML', query={ | ||
'ak': 'web', | ||
'ptmd': 'true', | ||
'id': video_id, | ||
'profile': 'player2', | ||
}) | ||
|
||
title = title or xpath_text( | ||
details, './/information/title', 'title', fatal=True) | ||
content_id = xpath_text( | ||
details, './/video/details/basename', 'content id', fatal=True) | ||
|
||
info = self._extract_ptmd( | ||
'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id, | ||
content_id, None, url) | ||
|
||
timestamp = unified_timestamp(xpath_text(details, './/details/airtime')) | ||
|
||
thumbnails = [] | ||
for node in details.findall('.//teaserimages/teaserimage'): | ||
thumbnail_url = node.text | ||
if not thumbnail_url: | ||
continue | ||
thumbnail = { | ||
'url': thumbnail_url, | ||
} | ||
thumbnail_key = node.get('key') | ||
if thumbnail_key: | ||
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) | ||
if m: | ||
thumbnail['width'] = int(m.group(1)) | ||
thumbnail['height'] = int(m.group(2)) | ||
thumbnails.append(thumbnail) | ||
|
||
api_url = 'http://www.phoenix.de/php/mediaplayer/data/beitrags_details.php?ak=web&id=%s' % internal_id | ||
return self.extract_from_xml_url(video_id, api_url) | ||
return merge_dicts(info, { | ||
'id': content_id, | ||
'title': title, | ||
'description': xpath_text(details, './/information/detail'), | ||
'duration': int_or_none(xpath_text(details, './/details/lengthSec')), | ||
'thumbnails': thumbnails, | ||
'timestamp': timestamp, | ||
'uploader': xpath_text(details, './/details/channel'), | ||
'uploader_id': xpath_text(details, './/details/originChannelId'), | ||
'channel': xpath_text(details, './/details/originChannelTitle'), | ||
}) |
Oops, something went wrong.