From 177662e0f24bfd54e57b87698739d7a518321bac Mon Sep 17 00:00:00 2001 From: sam Date: Tue, 4 Oct 2022 02:52:30 +1300 Subject: [PATCH] [extractor/MicrosoftEmbed] Add extractor (#5082) Closes #2638 Authored by: DoubleCouponDay --- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/microsoftembed.py | 70 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 yt_dlp/extractor/microsoftembed.py diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4fcf1f5cc..bc6de4926 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3640,7 +3640,7 @@ class YoutubeDL: return None return render_table( self._list_format_headers('ID', 'Width', 'Height', 'URL'), - [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]) + [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails]) def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f4d7c3ab5..3a92c1d02 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -960,6 +960,7 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .microsoftembed import MicrosoftEmbedIE from .mildom import ( MildomIE, MildomVodIE, diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py new file mode 100644 index 000000000..8cdf66778 --- /dev/null +++ b/yt_dlp/extractor/microsoftembed.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + + +class MicrosoftEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?microsoft\.com/(?:[^/]+/)?videoplayer/embed/(?P[a-z0-9A-Z]+)' + + _TESTS = [{ + 'url': 'https://www.microsoft.com/en-us/videoplayer/embed/RWL07e', + 'md5': 'eb0ae9007f9b305f9acd0a03e74cb1a9', + 'info_dict': { + 'id': 'RWL07e', + 'title': 'Microsoft for Public Health and Social Services', + 'ext': 'mp4', + 'thumbnail': 'http://img-prod-cms-rt-microsoft-com.akamaized.net/cms/api/am/imageFileData/RWL7Ju?ver=cae5', + 'age_limit': 0, + 'timestamp': 1631658316, + 'upload_date': '20210914' + } + }] + _API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/' + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json(self._API_URL + video_id, video_id) + + formats = [] + for source_type, source in metadata['streams'].items(): + if source_type == 'smooth_Streaming': + formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss')) + elif source_type == 'apple_HTTP_Live_Streaming': + formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4')) + elif source_type == 'mPEG_DASH': + formats.extend(self._extract_mpd_formats(source['url'], video_id)) + else: + formats.append({ + 'format_id': source_type, + 'url': source['url'], + 'height': source.get('heightPixels'), + 'width': source.get('widthPixels'), + }) + self._sort_formats(formats) + + subtitles = { + lang: [{ + 'url': data.get('url'), + 'ext': 'vtt', + }] for lang, data in traverse_obj(metadata, 'captions', default={}).items() + } + + thumbnails = [{ + 'url': thumb.get('url'), + 'width': thumb.get('width') or None, + 'height': thumb.get('height') or None, + } for thumb in traverse_obj(metadata, ('snippet', 'thumbnails', ...))] + self._remove_duplicate_formats(thumbnails) + + return { + 'id': video_id, + 'title': traverse_obj(metadata, ('snippet', 'title')), + 'timestamp': unified_timestamp(traverse_obj(metadata, ('snippet', 'activeStartDate'))), + 'age_limit': int_or_none(traverse_obj(metadata, ('snippet', 'minimumAge'))) or 0, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + }