From 9359f3d4f02856128f5626e754c7f64e2232b02f Mon Sep 17 00:00:00 2001 From: Felix S Date: Sat, 2 Oct 2021 18:43:42 +0000 Subject: [PATCH] [extractor] Extract storyboards from SMIL manifests (#1128) Authored by: fstirlitz --- yt_dlp/YoutubeDL.py | 6 +++--- yt_dlp/extractor/common.py | 23 ++++++++++++++++++++-- yt_dlp/utils.py | 39 +++++++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index c42a29ee3..9c4dd3ec5 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3029,9 +3029,7 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): - if format.get('vcodec') == 'none': - if format.get('acodec') == 'none': - return 'images' + if format.get('vcodec') == 'none' and format.get('acodec') != 'none': return 'audio only' if format.get('resolution') is not None: return format['resolution'] @@ -3043,6 +3041,8 @@ class YoutubeDL(object): res = '%dx?' % format['width'] else: res = default + if format.get('vcodec') == 'none' and format.get('acodec') == 'none': + res += ' (images)' return res def _format_note(self, fdict): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 5da29dc63..f65a098d7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2346,14 +2346,15 @@ class InfoExtractor(object): rtmp_count = 0 http_count = 0 m3u8_count = 0 + imgs_count = 0 - srcs = [] + srcs = set() media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) for medium in media: src = medium.get('src') if not src or src in srcs: continue - srcs.append(src) + srcs.add(src) bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) filesize = int_or_none(medium.get('size') or medium.get('fileSize')) @@ -2427,6 +2428,24 @@ class InfoExtractor(object): 'height': height, }) + for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)): + src = medium.get('src') + if not src or src in srcs: + continue + srcs.add(src) + + imgs_count += 1 + formats.append({ + 'format_id': 'imagestream-%d' % (imgs_count), + 'url': src, + 'ext': mimetype2ext(medium.get('type')), + 'acodec': 'none', + 'vcodec': 'none', + 'width': int_or_none(medium.get('width')), + 'height': int_or_none(medium.get('height')), + 'format_note': 'SMIL storyboards', + }) + return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 1bc0ac767..7a77edf4c 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -4546,20 +4546,24 @@ def mimetype2ext(mt): if mt is None: return None - ext = { + mt, _, params = mt.partition(';') + mt = mt.strip() + + FULL_MAP = { 'audio/mp4': 'm4a', # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as # it's the most popular one 'audio/mpeg': 'mp3', 'audio/x-wav': 'wav', - }.get(mt) + 'audio/wav': 'wav', + 'audio/wave': 'wav', + } + + ext = FULL_MAP.get(mt) if ext is not None: return ext - _, _, res = mt.rpartition('/') - res = res.split(';')[0].strip().lower() - - return { + SUBTYPE_MAP = { '3gpp': '3gp', 'smptett+xml': 'tt', 'ttaf+xml': 'dfxp', @@ -4578,7 +4582,28 @@ def mimetype2ext(mt): 'quicktime': 'mov', 'mp2t': 'ts', 'x-wav': 'wav', - }.get(res, res) + 'filmstrip+json': 'fs', + 'svg+xml': 'svg', + } + + _, _, subtype = mt.rpartition('/') + ext = SUBTYPE_MAP.get(subtype.lower()) + if ext is not None: + return ext + + SUFFIX_MAP = { + 'json': 'json', + 'xml': 'xml', + 'zip': 'zip', + 'gzip': 'gz', + } + + _, _, suffix = subtype.partition('+') + ext = SUFFIX_MAP.get(suffix) + if ext is not None: + return ext + + return subtype.replace('+', '.') def parse_codecs(codecs_str):