From d39c122abdf4dab56ebc415162d31f2340aace9d Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 11 Sep 2022 19:39:17 +0200 Subject: [PATCH 01/23] Fixes for VRT MAX --- yt_dlp/extractor/canvas.py | 99 +++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 8eff4a57c..471d3d34b 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -20,7 +20,7 @@ from ..utils import ( class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https://media-services-public\.vrt\.be/media-aggregator/v2/media-items/(?P.+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', @@ -43,48 +43,41 @@ class CanvasIE(InfoExtractor): 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' + _REST_API_BASE_TOKEN = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' + _REST_API_BASE_VIDEO = 'https://media-services-public.vrt.be/media-aggregator/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') + video_id = mobj.group('video_id') data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'null', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) + + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( + '%s/tokens' % self._REST_API_BASE_TOKEN, video_id, + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] + data = self._download_json( + '%s/media-items/%s' % (self._REST_API_BASE_VIDEO, video_id), + video_id, 'Downloading video JSON', query={ + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'vrtnu-web@PROD', + }, expected_status=400) + if 'title' not in data: + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) # Note: The title may be an empty string - title = data['title'] or f'{site_id} {video_id}' + title = data['title'] or f'{video_id}' description = data.get('description') formats = [] @@ -224,7 +217,7 @@ class CanvasEenIE(InfoExtractor): class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?P.+)/(?P[0-9]{4})/(?P.+)/?' _TESTS = [{ # Available via old API endpoint 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', @@ -312,28 +305,21 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id + episode_data = self._download_json(f'{url.strip("/")}.model.json', display_id, + 'Downloading asset JSON', 'Unable to download asset JSON') + details = episode_data.get('details') + actions = details.get('actions') + episode_publication_id = actions[2].get('episodePublicationId') + episode_video_id = actions[2].get('episodeVideoId') + video_id = f'{episode_publication_id}${episode_video_id}' - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { + return { '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), + 'url': 'https://media-services-public.vrt.be/media-aggregator/v2/media-items/%s' % video_id, + 'ie_key': 'Canvas', 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) + 'display_id': display_id + } class DagelijkseKostIE(InfoExtractor): @@ -382,3 +368,4 @@ class DagelijkseKostIE(InfoExtractor): 'title': title, 'description': description, } + From aa0317f209168ca4090f79064eb212e6a3a19e08 Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 13 Sep 2022 22:14:13 +0200 Subject: [PATCH 02/23] flake8 --- yt_dlp/extractor/canvas.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 471d3d34b..0cea464d6 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -7,11 +7,8 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, clean_html, - extract_attributes, float_or_none, get_element_by_class, - int_or_none, - merge_dicts, str_or_none, strip_or_none, url_or_none, @@ -53,8 +50,8 @@ class CanvasIE(InfoExtractor): data = None vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() headers.update({'Content-Type': 'application/json; charset=utf-8'}) vrtPlayerToken = self._download_json( @@ -306,7 +303,7 @@ class VrtNUIE(GigyaBaseIE): display_id = self._match_id(url) episode_data = self._download_json(f'{url.strip("/")}.model.json', display_id, - 'Downloading asset JSON', 'Unable to download asset JSON') + 'Downloading asset JSON', 'Unable to download asset JSON') details = episode_data.get('details') actions = details.get('actions') episode_publication_id = actions[2].get('episodePublicationId') @@ -368,4 +365,3 @@ class DagelijkseKostIE(InfoExtractor): 'title': title, 'description': description, } - From f0075841e8483f3eaf1ba1c6c9be1ade3c96c76e Mon Sep 17 00:00:00 2001 From: bergoid Date: Thu, 15 Sep 2022 23:55:07 +0200 Subject: [PATCH 03/23] Add test. Clear dead VRT extractors --- yt_dlp/extractor/_extractors.py | 7 - yt_dlp/extractor/canvas.py | 367 -------------------------------- yt_dlp/extractor/ketnet.py | 70 ------ yt_dlp/extractor/vrt.py | 226 ++++++++++++++------ 4 files changed, 161 insertions(+), 509 deletions(-) delete mode 100644 yt_dlp/extractor/canvas.py delete mode 100644 yt_dlp/extractor/ketnet.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index aedf063f6..55d22836d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -246,12 +246,6 @@ from .camwithher import CamWithHerIE from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, @@ -786,7 +780,6 @@ from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE from .khanacademy import ( KhanAcademyIE, KhanAcademyUnitIE, diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py deleted file mode 100644 index 0cea464d6..000000000 --- a/yt_dlp/extractor/canvas.py +++ /dev/null @@ -1,367 +0,0 @@ -import json - - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - float_or_none, - get_element_by_class, - str_or_none, - strip_or_none, - url_or_none, - urlencode_postdata -) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https://media-services-public\.vrt\.be/media-aggregator/v2/media-items/(?P.+)' - _TESTS = [{ - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8_native', - } - _REST_API_BASE_TOKEN = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' - _REST_API_BASE_VIDEO = 'https://media-services-public.vrt.be/media-aggregator/v2' - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - - data = None - - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE_TOKEN, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/media-items/%s' % (self._REST_API_BASE_VIDEO, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'vrtnu-web@PROD', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{video_id}' - description = data.get('description') - - formats = [] - subtitles = {} - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - fmts, subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HSS': - fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - self._sort_formats(formats) - - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?Pcanvas|een)\.be/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r']+class="video__body__header__title"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?P.+)/(?P[0-9]{4})/(?P.+)/?' - _TESTS = [{ - # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - episode_data = self._download_json(f'{url.strip("/")}.model.json', display_id, - 'Downloading asset JSON', 'Unable to download asset JSON') - details = episode_data.get('details') - actions = details.get('actions') - episode_publication_id = actions[2].get('episodePublicationId') - episode_video_id = actions[2].get('episodeVideoId') - video_id = f'{episode_publication_id}${episode_video_id}' - - return { - '_type': 'url_transparent', - 'url': 'https://media-services-public.vrt.be/media-aggregator/v2/media-items/%s' % video_id, - 'ie_key': 'Canvas', - 'id': video_id, - 'display_id': display_id - } - - -class DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/yt_dlp/extractor/ketnet.py b/yt_dlp/extractor/ketnet.py deleted file mode 100644 index ab6276727..000000000 --- a/yt_dlp/extractor/ketnet.py +++ /dev/null @@ -1,70 +0,0 @@ -from .canvas import CanvasIE -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', - 'ext': 'mp4', - 'title': 'Nachtwacht - Reeks 3: Aflevering 1', - 'description': 'De Nachtwacht krijgt te maken met een parasiet', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - 'timestamp': 1609225200, - 'upload_date': '20201229', - 'series': 'Nachtwacht', - 'season': 'Reeks 3', - 'episode': 'De Greystook', - 'episode_number': 1, - }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], - }, { - 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ - 'query': '''{ - video(id: "content/ketnet/nl/%s.model.json") { - description - episodeNr - imageUrl - mediaReference - programTitle - publicationDate - seasonTitle - subtitleVideodetail - titleVideodetail - } -}''' % display_id, - })['data']['video'] - - mz_id = compat_urllib_parse_unquote(video['mediaReference']) - - return { - '_type': 'url_transparent', - 'id': mz_id, - 'title': video['titleVideodetail'], - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, - 'thumbnail': video.get('imageUrl'), - 'description': video.get('description'), - 'timestamp': parse_iso8601(video.get('publicationDate')), - 'series': video.get('programTitle'), - 'season': video.get('seasonTitle'), - 'episode': video.get('subtitleVideodetail'), - 'episode_number': int_or_none(video.get('episodeNr')), - 'ie_key': CanvasIE.ie_key(), - } diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 26f48bf67..410e23a79 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,82 +1,178 @@ -from .common import InfoExtractor +import json + +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, + ExtractorError, float_or_none, - get_element_by_class, - strip_or_none, - unified_timestamp, + str_or_none, + url_or_none, + urlencode_postdata ) -class VRTIE(InfoExtractor): - IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' - _VALID_URL = r'https?://(?:www\.)?(?Pvrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P[^/?&#]+)' +class VRTIE(GigyaBaseIE): + IE_DESC = 'VRT' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?P.+)/?' _TESTS = [{ - 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'md5': 'e1663accf5cf13f375f3cd0d10476669', - 'info_dict': { - 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', - 'ext': 'mp4', - 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', - 'timestamp': 1557924660, - 'upload_date': '20190515', - 'duration': 31.2, - }, - }, { - 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'md5': '910bba927566e9ab992278f647eb4b75', + 'url': 'https://www.vrt.be/vrtnu/a-z/heizel-1985/trailer/heizel-1985-trailer/', 'info_dict': { - 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', - 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', - 'timestamp': 1557923760, - 'upload_date': '20190515', - 'duration': 115.17, + 'id': 'pbs-pub-e1d6e4ec-cbf4-451e-9e87-d835bb65cd28$vid-2ad45eb6-9bc8-40d4-ad72-5f25c0f59d75', + 'title': 'Trailer \'Heizel 1985\'', + 'thumbnail': 'https://images.vrt.be/orig/2022/09/07/6e44ce6f-2eb3-11ed-b07d-02b7b76bf47f.jpg', + 'duration': 41.05 }, - }, { - 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', - 'only_matching': True, - }, { - 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', - 'only_matching': True, }] - _CLIENT_MAP = { - 'vrt.be/vrtnws': 'vrtnieuws', - 'sporza.be': 'sporza', + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' + _CONTEXT_ID = 'R3595707040' + _REST_API_BASE_TOKEN = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' + _REST_API_BASE_VIDEO = 'https://media-services-public.vrt.be/media-aggregator/v2' + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8_native', } + _authenticated = False + + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) + + if auth_info.get('errorDetails'): + raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + self._request_webpage('https://token.vrt.be/vrtnuinitlogin', + None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', + query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) + + post_data = { + 'UID': auth_info['UID'], + 'UIDSignature': auth_info['UIDSignature'], + 'signatureTimestamp': auth_info['signatureTimestamp'], + '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + } + + self._request_webpage( + 'https://login.vrt.be/perform_login', + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) + + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e + else: + break + + self._authenticated = True + def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, display_id) - attrs = extract_attributes(self._search_regex( - r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video')) - - asset_id = attrs['data-video-id'] - publication_id = attrs.get('data-publication-id') - if publication_id: - asset_id = publication_id + '$' + asset_id - client = attrs.get('data-client-code') or self._CLIENT_MAP[site] - - title = strip_or_none(get_element_by_class( - 'vrt-title', webpage) or self._html_search_meta( - ['og:title', 'twitter:title', 'name'], webpage)) - description = self._html_search_meta( - ['og:description', 'twitter:description', 'description'], webpage) - if description == '…': - description = None - timestamp = unified_timestamp(self._html_search_meta( - 'article:published_time', webpage)) + display_id = self._match_id(url) + + episode_data = self._download_json(f'{url.strip("/")}.model.json', display_id, + 'Downloading asset JSON', 'Unable to download asset JSON') + details = episode_data.get('details') + actions = details.get('actions') + episode_publication_id = actions[2].get('episodePublicationId') + episode_video_id = actions[2].get('episodeVideoId') + video_id = f'{episode_publication_id}${episode_video_id}' + + data = None + vrtnutoken = "" + + if self._authenticated: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] + + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( + '%s/tokens' % self._REST_API_BASE_TOKEN, video_id, + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] + + data = self._download_json( + '%s/media-items/%s' % (self._REST_API_BASE_VIDEO, video_id), + video_id, 'Downloading video JSON', query={ + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'vrtnu-web@PROD', + }, expected_status=400) + if 'title' not in data: + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) + + # Note: The title may be an empty string + title = data['title'] or f'{video_id}' + description = data.get('description') + + formats = [] + subtitles = {} + for target in data['targetUrls']: + format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) + if not format_url or not format_type: + continue + format_type = format_type.upper() + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], + m3u8_id=format_type, fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + elif format_type == 'HSS': + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + self._sort_formats(formats) + + subtitle_urls = data.get('subtitleUrls') + if isinstance(subtitle_urls, list): + for subtitle in subtitle_urls: + subtitle_url = subtitle.get('url') + if subtitle_url and subtitle.get('type') == 'CLOSED': + subtitles.setdefault('nl', []).append({'url': subtitle_url}) return { - '_type': 'url_transparent', - 'id': asset_id, - 'display_id': display_id, + 'id': video_id, + 'display_id': video_id, 'title': title, 'description': description, - 'thumbnail': attrs.get('data-posterimage'), - 'timestamp': timestamp, - 'duration': float_or_none(attrs.get('data-duration'), 1000), - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), - 'ie_key': 'Canvas', + 'formats': formats, + 'duration': float_or_none(data.get('duration'), 1000), + 'thumbnail': data.get('posterImageUrl'), + 'subtitles': subtitles, } From 9c76ec8738f2d312cd3be18aafca7a16026413a7 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 17 Sep 2022 01:16:24 +0200 Subject: [PATCH 04/23] VRTIE: regex, metadata --- yt_dlp/extractor/vrt.py | 78 ++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 410e23a79..75a61642e 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,9 +1,12 @@ import json +from time import (strftime, gmtime) from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + parse_iso8601, ExtractorError, + int_or_none, float_or_none, str_or_none, url_or_none, @@ -13,14 +16,27 @@ from ..utils import ( class VRTIE(GigyaBaseIE): IE_DESC = 'VRT' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?P.+)/?' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.vrt.be/vrtnu/a-z/heizel-1985/trailer/heizel-1985-trailer/', 'info_dict': { 'id': 'pbs-pub-e1d6e4ec-cbf4-451e-9e87-d835bb65cd28$vid-2ad45eb6-9bc8-40d4-ad72-5f25c0f59d75', 'title': 'Trailer \'Heizel 1985\'', 'thumbnail': 'https://images.vrt.be/orig/2022/09/07/6e44ce6f-2eb3-11ed-b07d-02b7b76bf47f.jpg', - 'duration': 41.05 + 'ext': 'mp4', + 'duration': 41.05, + 'release_date': '20220908', + 'description': 'md5:816dcd9e3be706b16e4e32e3f723a5a1', + 'display_id': 'heizel-1985-trailer', + 'episode_number': 0, + 'timestamp': 1662609600, + 'series': 'Heizel 1985', + 'upload_date': '20220908', + 'channel': 'VRT', + 'season_id': '1662373764370', + 'episode_id': '1662373764405', + 'release_timestamp': 1662609600, + 'episode': 'Aflevering 0' }, }] _NETRC_MACHINE = 'vrtnu' @@ -84,15 +100,29 @@ class VRTIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - episode_data = self._download_json(f'{url.strip("/")}.model.json', display_id, - 'Downloading asset JSON', 'Unable to download asset JSON') - details = episode_data.get('details') + model_json = self._download_json(f'{url.strip("/")}.model.json', display_id, + 'Downloading asset JSON', 'Unable to download asset JSON') + details = model_json.get('details') actions = details.get('actions') + title = details.get('title') episode_publication_id = actions[2].get('episodePublicationId') episode_video_id = actions[2].get('episodeVideoId') video_id = f'{episode_publication_id}${episode_video_id}' - - data = None + description = details.get('description') + episode = details.get('data').get('episode') + display_id = episode.get('name') + timestamp = parse_iso8601(episode.get('onTime').get('raw')) + upload_date = strftime('%Y%m%d', gmtime(timestamp)) + series_info = details.get('data') + series = series_info.get('program').get('title') + season = series_info.get('season').get('title').get('value') + season_number = series_info.get('season').get('title').get('raw') + season_id = series_info.get('season').get('id') + episode = series_info.get('episode').get('number').get('value') + episode_number = series_info.get('episode').get('number').get('raw') + episode_id = series_info.get('episode').get('id') + + video_info = None vrtnutoken = "" if self._authenticated: @@ -108,27 +138,23 @@ class VRTIE(GigyaBaseIE): 'identityToken': vrtnutoken }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( + video_info = self._download_json( '%s/media-items/%s' % (self._REST_API_BASE_VIDEO, video_id), video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': vrtPlayerToken, 'client': 'vrtnu-web@PROD', }, expected_status=400) - if 'title' not in data: - code = data.get('code') + if 'title' not in video_info: + code = video_info.get('code') if code == 'AUTHENTICATION_REQUIRED': self.raise_login_required() elif code == 'INVALID_LOCATION': self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{video_id}' - description = data.get('description') + raise ExtractorError(video_info.get('message') or code, expected=True) formats = [] subtitles = {} - for target in data['targetUrls']: + for target in video_info['targetUrls']: format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) if not format_url or not format_type: continue @@ -159,7 +185,7 @@ class VRTIE(GigyaBaseIE): }) self._sort_formats(formats) - subtitle_urls = data.get('subtitleUrls') + subtitle_urls = video_info.get('subtitleUrls') if isinstance(subtitle_urls, list): for subtitle in subtitle_urls: subtitle_url = subtitle.get('url') @@ -168,11 +194,23 @@ class VRTIE(GigyaBaseIE): return { 'id': video_id, - 'display_id': video_id, + 'display_id': display_id, + 'timestamp': timestamp, + 'release_timestamp': timestamp, + 'upload_date': upload_date, + 'release_date': upload_date, 'title': title, 'description': description, + 'series': series, + 'season': season, + 'season_number': int_or_none(season_number), + 'season_id': season_id, + 'episode': episode, + 'episode_number': episode_number, + 'episode_id': episode_id, + 'channel': 'VRT', 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), + 'duration': float_or_none(video_info.get('duration'), 1000), + 'thumbnail': video_info.get('posterImageUrl'), 'subtitles': subtitles, } From 5621944b0b00faa5cb82224c3b3261a1ea6022c3 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 16 Sep 2023 01:23:38 +0200 Subject: [PATCH 05/23] login but no tokens --- yt_dlp/extractor/vrt.py | 147 ++++++++++++++++++++++++++++++---------- 1 file changed, 111 insertions(+), 36 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index ea609e44a..ad6d74ad6 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -2,6 +2,9 @@ import functools import json import time import urllib.parse +################## +import sys +################## from .gigya import GigyaBaseIE from ..networking.exceptions import HTTPError @@ -285,49 +288,121 @@ class VrtNUIE(VRTBaseIE): _NETRC_MACHINE = 'vrtnu' _authenticated = False + def _extract_cookies(self, res): + cookies_nvp = [header_value.split(';')[0] for header_value in res.headers.get_all('Set-Cookie')] + return {name: value for nvp in cookies_nvp for name, value in [nvp.split('=')]} + + def _create_cookie_header(self, cookies): + return { 'Cookie': '; '.join([f'{key}={value}' for key, value in cookies.items()]) } + def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) + # Step 1 + # ------ + res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None) + session_cookies = self._extract_cookies(res) - if auth_info.get('errorDetails'): - raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True) +# print("===================================") +# print(res.status) +# # print(res.headers) +# print(json.dumps(cookies)) +# # print(self._make_cookie_header(cookies)) +# print("===================================") + + + # ( Step 2 : already done in Step 1 through redir ) + + # Step 3 + # ------ + headers = { + 'Content-Type': 'application/json', + 'Oidcxsrf': f'{session_cookies["OIDCXSRF"]}', + **self._create_cookie_header(session_cookies) + } +# sys.exit(0) + post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } + res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) + +# print("===================================") +# print(res.status) +# print("===================================") +# +# sys.exit(0) + + # Step 4 + # ------ + headers = { + 'Host': 'login.vrt.be', + **self._create_cookie_header(session_cookies) + } + + query= { + 'redirect_uri': 'https://www.vrt.be/vrtnu/sso/callback', + 'response_type': 'code', + 'client_id': 'vrtnu-site', +# 'scope': urllib.parse.quote('openid profile email address video accessibility subprofiles mid') +# 'scope': 'openid profile email address video accessibility subprofiles mid' + } + res = self._request_webpage('https://login.vrt.be/authorize', None, fatal=True, headers=headers, query=query) +# tokens = self._extract_cookies(res) + + print("===================================") + print(res.headers) +# print(json.dumps(tokens)) + print(res.status) + print("===================================") + + sys.exit(0) + + + +# auth_info = self._gigya_login({ +# 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', +# 'targetEnv': 'jssdk', +# 'loginID': username, +# 'password': password, +# 'authMode': 'cookie', +# }) +# +# if auth_info.get('errorDetails'): +# raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True) + +# print("===================================") +# print(json.dumps(auth_info, indent=2)) +# print("===================================") +# sys.exit(0) # Sometimes authentication fails for no good reason, retry - for retry in self.RetryManager(): - if retry.attempt > 1: - self._sleep(1, None) - try: - self._request_webpage( - 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', - errnote='Could not get XSRF Token', query={ - 'provider': 'site', - 'destination': 'https://www.vrt.be/vrtnu/', - }) - self._request_webpage( - 'https://login.vrt.be/perform_login', None, - note='Performing login', errnote='Login failed', - query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - })) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 401: - retry.error = e - continue - raise +# for retry in self.RetryManager(): +# if retry.attempt > 1: +# self._sleep(1, None) +# try: +# self._request_webpage( +# 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', +# errnote='Could not get XSRF Token', query={ +# 'provider': 'site', +# 'destination': 'https://www.vrt.be/vrtnu/', +# }) +# self._request_webpage( +# 'https://login.vrt.be/perform_login', None, +# note='Performing login', errnote='Login failed', +# query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ +# 'UID': auth_info['UID'], +# 'UIDSignature': auth_info['UIDSignature'], +# 'signatureTimestamp': auth_info['signatureTimestamp'], +# '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, +# })) +# except ExtractorError as e: +# if isinstance(e.cause, HTTPError) and e.cause.status == 401: +# retry.error = e +# continue +# raise self._authenticated = True def _real_extract(self, url): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) + print(f'Model JSON URL: {parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json') details = self._download_json( f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json', display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details'] @@ -339,9 +414,9 @@ class VrtNUIE(VRTBaseIE): if '$' not in video_id: raise ExtractorError('Unable to extract video ID') - vrtnutoken = self._download_json( - 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', - errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None +# vrtnutoken = self._download_json( +# 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', +# errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken) From 843649e2ef28b233bac331fc2b8e0398e8c3126f Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 17 Sep 2023 10:15:38 +0200 Subject: [PATCH 06/23] non-working tryouts --- yt_dlp/extractor/vrt.py | 77 ++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index ad6d74ad6..4cb0baeb5 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,7 +1,9 @@ import functools import json import time +from http.cookiejar import CookieJar import urllib.parse +import urllib.request ################## import sys ################## @@ -234,6 +236,11 @@ class VRTIE(VRTBaseIE): } +class NoRedirect(urllib.request.HTTPRedirectHandler): + def redirect_request(self, req, fp, code, msg, headers, newurl): + return None + + class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' @@ -296,59 +303,81 @@ class VrtNUIE(VRTBaseIE): return { 'Cookie': '; '.join([f'{key}={value}' for key, value in cookies.items()]) } def _perform_login(self, username, password): - # Step 1 - # ------ - res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None) - session_cookies = self._extract_cookies(res) -# print("===================================") -# print(res.status) -# # print(res.headers) -# print(json.dumps(cookies)) -# # print(self._make_cookie_header(cookies)) -# print("===================================") + # 1. Get session cookies + # Using urllib directly to be able to grab all cookies while following redirect + cookie_jar = CookieJar() + opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar)) + urllib.request.install_opener(opener) + res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) + cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} + session_cookies = f'OIDCXSRF={cookies_dict["OIDCXSRF"]}; SESSION={cookies_dict["SESSION"]}' + +# ytdlp_cookies = self._get_cookies('https://www.vrt.be') +# session_cookies = self._extract_cookies(res) - # ( Step 2 : already done in Step 1 through redir ) + print("===================================") + print(res.status) + +# for cookie in cookie_jar: +# print(f'{cookie.name}={cookie.value}') - # Step 3 + print(cookies_dict) + print("===================================") + +# sys.exit(0) + + # 2. # ------ headers = { 'Content-Type': 'application/json', - 'Oidcxsrf': f'{session_cookies["OIDCXSRF"]}', - **self._create_cookie_header(session_cookies) +# 'Oidcxsrf': f'{cookies_dict["OIDCXSRF"]}', + 'Oidcxsrf': cookies_dict["OIDCXSRF"], + 'Cookie': session_cookies } # sys.exit(0) post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) -# print("===================================") -# print(res.status) -# print("===================================") -# + print("===================================") + print(res.status) + print("===================================") + # sys.exit(0) - # Step 4 - # ------ + # 3. + opener = urllib.request.build_opener(NoRedirect) + urllib.request.install_opener(opener) headers = { 'Host': 'login.vrt.be', - **self._create_cookie_header(session_cookies) + 'Cookie': session_cookies, } query= { - 'redirect_uri': 'https://www.vrt.be/vrtnu/sso/callback', 'response_type': 'code', + 'access_type': 'offline', 'client_id': 'vrtnu-site', + 'scope': 'openid%20profile%20email%20address%20video%20accessibility%20subprofiles%20mid', + 'redirect_uri': 'https%3A%2F%2Fwww.vrt.be%2Fvrtnu%2Fsso%2Fcallback', + 'state': cookies_dict["oidcstate"], + 'code_challenge': 'cxsSoHdwPdr0yY8AwHnwfDWRSnImhlCTo4lWgbz7few', + 'code_challenge_method': 'S256' # 'scope': urllib.parse.quote('openid profile email address video accessibility subprofiles mid') # 'scope': 'openid profile email address video accessibility subprofiles mid' } - res = self._request_webpage('https://login.vrt.be/authorize', None, fatal=True, headers=headers, query=query) + request = urllib.request.Request('https://login.vrt.be/authorize?' + urllib.parse.urlencode(query), headers=headers) + res = urllib.request.urlopen(request, None) +# cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} +# res = self._request_webpage('https://login.vrt.be/authorize', None, fatal=True, headers=headers, query=query) # tokens = self._extract_cookies(res) print("===================================") + print('https://login.vrt.be/authorize?' + urllib.parse.urlencode(query)) + print(res.status) print(res.headers) +# print(cookies_dict) # print(json.dumps(tokens)) - print(res.status) print("===================================") sys.exit(0) From b619e119e6926c7b58cdb422d80df67ec7d7f4d9 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 17 Sep 2023 11:50:57 +0200 Subject: [PATCH 07/23] Able to get site_profile tokens --- yt_dlp/extractor/vrt.py | 83 +++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 4cb0baeb5..302637165 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -237,8 +237,11 @@ class VRTIE(VRTBaseIE): class NoRedirect(urllib.request.HTTPRedirectHandler): - def redirect_request(self, req, fp, code, msg, headers, newurl): - return None + def http_error_302(self, req, fp, code, msg, headers): + result = urllib.error.HTTPError(req.get_full_url(), code, msg, headers, fp) + return result + http_error_301 = http_error_303 = http_error_307 = http_error_302 + class VrtNUIE(VRTBaseIE): @@ -305,75 +308,81 @@ class VrtNUIE(VRTBaseIE): def _perform_login(self, username, password): - # 1. Get session cookies - # Using urllib directly to be able to grab all cookies while following redirect cookie_jar = CookieJar() - opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar)) + + # Disable automatic redirection to be able to + # grab necessary info in intermediate step + opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(cookie_jar)) urllib.request.install_opener(opener) - res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) - cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} - session_cookies = f'OIDCXSRF={cookies_dict["OIDCXSRF"]}; SESSION={cookies_dict["SESSION"]}' -# ytdlp_cookies = self._get_cookies('https://www.vrt.be') -# session_cookies = self._extract_cookies(res) + # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie + res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) + auth_url = res.headers.get_all('Location')[0] print("===================================") + print('login') print(res.status) - + print(res.headers.get_all('Location')[0]) # for cookie in cookie_jar: # print(f'{cookie.name}={cookie.value}') + print("===================================") + + + # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies + res = urllib.request.urlopen(auth_url, None) + cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} + cookies_header = f'OIDCXSRF={cookies_dict["OIDCXSRF"]}; SESSION={cookies_dict["SESSION"]}' + print("===================================") + print('authorize') + print(res.status) print(cookies_dict) print("===================================") - + # sys.exit(0) - # 2. - # ------ + # 2. Perform login headers = { 'Content-Type': 'application/json', -# 'Oidcxsrf': f'{cookies_dict["OIDCXSRF"]}', 'Oidcxsrf': cookies_dict["OIDCXSRF"], - 'Cookie': session_cookies + 'Cookie': cookies_header } -# sys.exit(0) post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) print("===================================") + print('perform_login') print(res.status) print("===================================") # sys.exit(0) - # 3. - opener = urllib.request.build_opener(NoRedirect) - urllib.request.install_opener(opener) + # 3.a Visit 'authorize' again headers = { 'Host': 'login.vrt.be', - 'Cookie': session_cookies, + 'Cookie': cookies_header } + request = urllib.request.Request(auth_url, headers=headers) + res = urllib.request.urlopen(request, None) + callback_url = res.headers.get_all('Location')[0] - query= { - 'response_type': 'code', - 'access_type': 'offline', - 'client_id': 'vrtnu-site', - 'scope': 'openid%20profile%20email%20address%20video%20accessibility%20subprofiles%20mid', - 'redirect_uri': 'https%3A%2F%2Fwww.vrt.be%2Fvrtnu%2Fsso%2Fcallback', - 'state': cookies_dict["oidcstate"], - 'code_challenge': 'cxsSoHdwPdr0yY8AwHnwfDWRSnImhlCTo4lWgbz7few', - 'code_challenge_method': 'S256' -# 'scope': urllib.parse.quote('openid profile email address video accessibility subprofiles mid') -# 'scope': 'openid profile email address video accessibility subprofiles mid' + print("===================================") + print('authorize') + print(res.status) + print(res.headers.get_all('Location')[0]) +# print(cookies_dict) +# print(json.dumps(tokens)) + print("===================================") + + # 3.b Visit 'callback' + headers = { + 'Cookie': f'oidcstate={cookies_dict["oidcstate"]}' } - request = urllib.request.Request('https://login.vrt.be/authorize?' + urllib.parse.urlencode(query), headers=headers) + request = urllib.request.Request(callback_url, headers=headers) res = urllib.request.urlopen(request, None) -# cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} -# res = self._request_webpage('https://login.vrt.be/authorize', None, fatal=True, headers=headers, query=query) -# tokens = self._extract_cookies(res) print("===================================") - print('https://login.vrt.be/authorize?' + urllib.parse.urlencode(query)) + print('callback') print(res.status) print(res.headers) # print(cookies_dict) From 28e4722b005329d4a8a199b3260cf85e29632c55 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 17 Sep 2023 16:19:33 +0200 Subject: [PATCH 08/23] Some streamlining --- yt_dlp/extractor/vrt.py | 48 +++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 302637165..4c0aa2e09 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -237,12 +237,25 @@ class VRTIE(VRTBaseIE): class NoRedirect(urllib.request.HTTPRedirectHandler): + def http_error_302(self, req, fp, code, msg, headers): result = urllib.error.HTTPError(req.get_full_url(), code, msg, headers, fp) return result + http_error_301 = http_error_303 = http_error_307 = http_error_302 +class CookiePot(CookieJar): + + def __getitem__(self, name): + for cookie in self: + if cookie.name == name: + return cookie.value + return None + + def __str__(self): + return '\n'.join(f'{cookie.name}={cookie.value}' for cookie in self) + class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' @@ -308,11 +321,19 @@ class VrtNUIE(VRTBaseIE): def _perform_login(self, username, password): - cookie_jar = CookieJar() + cookies = CookiePot() + + # TODO: + # 1. Does the _request_webpage() respect this opener too? + # 2. If so: + # a. modify (and rename) the class NoRedirect to store the 'Location:' header + # b. Steps 1.a & 1.b become one call to _request_webpage() + # + # https://stackoverflow.com/questions/47002795/how-to-trace-or-to-check-history-of-redirected-urls-with-python-only-urllib-libr # Disable automatic redirection to be able to # grab necessary info in intermediate step - opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(cookie_jar)) + opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(cookies)) urllib.request.install_opener(opener) # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie @@ -323,28 +344,28 @@ class VrtNUIE(VRTBaseIE): print('login') print(res.status) print(res.headers.get_all('Location')[0]) -# for cookie in cookie_jar: +# for cookie in cookies: # print(f'{cookie.name}={cookie.value}') print("===================================") # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies res = urllib.request.urlopen(auth_url, None) - cookies_dict = {cookie.name: cookie.value for cookie in cookie_jar} - cookies_header = f'OIDCXSRF={cookies_dict["OIDCXSRF"]}; SESSION={cookies_dict["SESSION"]}' +# cookies = {cookie.name: cookie.value for cookie in cookies} + cookies_header = f'OIDCXSRF={cookies["OIDCXSRF"]}; SESSION={cookies["SESSION"]}' print("===================================") print('authorize') print(res.status) - print(cookies_dict) + print(cookies) print("===================================") - + # sys.exit(0) # 2. Perform login headers = { 'Content-Type': 'application/json', - 'Oidcxsrf': cookies_dict["OIDCXSRF"], + 'Oidcxsrf': cookies["OIDCXSRF"], 'Cookie': cookies_header } post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } @@ -357,6 +378,8 @@ class VrtNUIE(VRTBaseIE): # sys.exit(0) + # TODO: re-enable auto redir here and do step 3 in one urlopen() call? + # 3.a Visit 'authorize' again headers = { 'Host': 'login.vrt.be', @@ -370,13 +393,13 @@ class VrtNUIE(VRTBaseIE): print('authorize') print(res.status) print(res.headers.get_all('Location')[0]) -# print(cookies_dict) +# print(cookies) # print(json.dumps(tokens)) print("===================================") # 3.b Visit 'callback' headers = { - 'Cookie': f'oidcstate={cookies_dict["oidcstate"]}' + 'Cookie': f'oidcstate={cookies["oidcstate"]}' } request = urllib.request.Request(callback_url, headers=headers) res = urllib.request.urlopen(request, None) @@ -385,8 +408,11 @@ class VrtNUIE(VRTBaseIE): print('callback') print(res.status) print(res.headers) -# print(cookies_dict) + print(cookies) # print(json.dumps(tokens)) + +# for cookie in cookies: +# print(f'{cookie.name}={cookie.value}') print("===================================") sys.exit(0) From c96f4429f3f94856079bbf0b04ce1c7e25081c95 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 17 Sep 2023 17:44:16 +0200 Subject: [PATCH 09/23] Able to get vrtPlayerToken --- yt_dlp/extractor/vrt.py | 183 +++++++++++++++++++++------------------- 1 file changed, 97 insertions(+), 86 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 4c0aa2e09..2dd0451f3 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -36,22 +36,23 @@ class VRTBaseIE(GigyaBaseIE): _PLAYER_INFO = { 'platform': 'desktop', 'app': { - 'type': 'browser', - 'name': 'Chrome', - }, + 'type': 'browser', + 'name': 'Chrome' + }, 'device': 'undefined (undefined)', 'os': { - 'name': 'Windows', - 'version': 'x86_64' - }, + 'name': 'Windows', + 'version': 'x86_64' + }, 'player': { - 'name': 'VRT web player', - 'version': '2.7.4-prod-2023-04-19T06:05:45' + 'name': 'VRT web player', + 'version': '3.2.6-prod-2023-09-11T12:37:41' + } } - } # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' - _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev + _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' +# _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595 # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae @@ -153,6 +154,7 @@ class VRTIE(VRTBaseIE): } _authenticated = False + _id_token = '' def _perform_login(self, username, password): auth_info = self._gigya_login({ @@ -311,12 +313,12 @@ class VrtNUIE(VRTBaseIE): _NETRC_MACHINE = 'vrtnu' _authenticated = False - def _extract_cookies(self, res): - cookies_nvp = [header_value.split(';')[0] for header_value in res.headers.get_all('Set-Cookie')] - return {name: value for nvp in cookies_nvp for name, value in [nvp.split('=')]} - - def _create_cookie_header(self, cookies): - return { 'Cookie': '; '.join([f'{key}={value}' for key, value in cookies.items()]) } +# def _extract_cookies(self, res): +# cookies_nvp = [header_value.split(';')[0] for header_value in res.headers.get_all('Set-Cookie')] +# return {name: value for nvp in cookies_nvp for name, value in [nvp.split('=')]} +# +# def _create_cookie_header(self, cookies): +# return { 'Cookie': '; '.join([f'{key}={value}' for key, value in cookies.items()]) } def _perform_login(self, username, password): @@ -340,25 +342,24 @@ class VrtNUIE(VRTBaseIE): res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) auth_url = res.headers.get_all('Location')[0] - print("===================================") - print('login') - print(res.status) - print(res.headers.get_all('Location')[0]) -# for cookie in cookies: -# print(f'{cookie.name}={cookie.value}') - print("===================================") +# print("===================================") +# print('login') +# print(res.status) +# print(res.headers.get_all('Location')[0]) +# # for cookie in cookies: +# # print(f'{cookie.name}={cookie.value}') +# print("===================================") # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies res = urllib.request.urlopen(auth_url, None) -# cookies = {cookie.name: cookie.value for cookie in cookies} cookies_header = f'OIDCXSRF={cookies["OIDCXSRF"]}; SESSION={cookies["SESSION"]}' - print("===================================") - print('authorize') - print(res.status) - print(cookies) - print("===================================") +# print("===================================") +# print('authorize') +# print(res.status) +# print(cookies) +# print("===================================") # sys.exit(0) @@ -371,14 +372,16 @@ class VrtNUIE(VRTBaseIE): post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) - print("===================================") - print('perform_login') - print(res.status) - print("===================================") +# print("===================================") +# print('perform_login') +# print(res.status) +# print("===================================") # sys.exit(0) - # TODO: re-enable auto redir here and do step 3 in one urlopen() call? + # TODO: + # . re-enable auto redir here and do step 3 in one urlopen() call? + # . should this step be the new "refreshtoken" in _real_extract? # 3.a Visit 'authorize' again headers = { @@ -389,13 +392,13 @@ class VrtNUIE(VRTBaseIE): res = urllib.request.urlopen(request, None) callback_url = res.headers.get_all('Location')[0] - print("===================================") - print('authorize') - print(res.status) - print(res.headers.get_all('Location')[0]) -# print(cookies) -# print(json.dumps(tokens)) - print("===================================") +# print("===================================") +# print('authorize') +# print(res.status) +# print(res.headers.get_all('Location')[0]) +# # print(cookies) +# # print(json.dumps(tokens)) +# print("===================================") # 3.b Visit 'callback' headers = { @@ -404,62 +407,70 @@ class VrtNUIE(VRTBaseIE): request = urllib.request.Request(callback_url, headers=headers) res = urllib.request.urlopen(request, None) - print("===================================") - print('callback') - print(res.status) - print(res.headers) - print(cookies) + _id_token = cookies['vrtnu-site_profile_vt'] + +# print("===================================") +# print('callback') +# print(res.status) +# print(res.headers) +# print(cookies) # print(json.dumps(tokens)) # for cookie in cookies: # print(f'{cookie.name}={cookie.value}') print("===================================") - sys.exit(0) +# sys.exit(0) + # 4. Obtain vrtPlayerToken + + # TODO: make this a constant at the top + ######################################### + player_info_base = { + 'platform': 'desktop', + 'app': { + 'type': 'browser', + 'name': 'Chrome' + }, + 'device': 'undefined (undefined)', + 'os': { + 'name': 'Windows', + 'version': 'x86_64' + }, + 'player': { + 'name': 'VRT web player', + 'version': '3.2.6-prod-2023-09-11T12:37:41' + } + } + + ######################################### + + # TODO: should move to _call_api() + + player_info = {'exp': (round(time.time(), 3) + 900), **player_info_base} + player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ + 'kid': self._JWT_KEY_ID + }).decode() + headers = { + **self.geo_verification_headers(), + 'Content-Type': 'application/json', + } -# auth_info = self._gigya_login({ -# 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', -# 'targetEnv': 'jssdk', -# 'loginID': username, -# 'password': password, -# 'authMode': 'cookie', -# }) -# -# if auth_info.get('errorDetails'): -# raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True) + data = { + 'identityToken': _id_token, + 'playerInfo': player_info_jwt + } -# print("===================================") -# print(json.dumps(auth_info, indent=2)) -# print("===================================") -# sys.exit(0) + json_response = self._download_json( + 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) - # Sometimes authentication fails for no good reason, retry -# for retry in self.RetryManager(): -# if retry.attempt > 1: -# self._sleep(1, None) -# try: -# self._request_webpage( -# 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', -# errnote='Could not get XSRF Token', query={ -# 'provider': 'site', -# 'destination': 'https://www.vrt.be/vrtnu/', -# }) -# self._request_webpage( -# 'https://login.vrt.be/perform_login', None, -# note='Performing login', errnote='Login failed', -# query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ -# 'UID': auth_info['UID'], -# 'UIDSignature': auth_info['UIDSignature'], -# 'signatureTimestamp': auth_info['signatureTimestamp'], -# '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, -# })) -# except ExtractorError as e: -# if isinstance(e.cause, HTTPError) and e.cause.status == 401: -# retry.error = e -# continue -# raise + print("===================================") + print(json.dumps(json_response)) + print("===================================") + + sys.exit(0) self._authenticated = True From deeb4857bd05a8adf4565120f8cd6482170ffdff Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 17 Sep 2023 23:17:58 +0200 Subject: [PATCH 10/23] Download works --- yt_dlp/extractor/vrt.py | 232 ++++++++++++---------------------------- 1 file changed, 67 insertions(+), 165 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 2dd0451f3..333ff1f9c 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -4,9 +4,6 @@ import time from http.cookiejar import CookieJar import urllib.parse import urllib.request -################## -import sys -################## from .gigya import GigyaBaseIE from ..networking.exceptions import HTTPError @@ -33,6 +30,7 @@ from ..utils import ( class VRTBaseIE(GigyaBaseIE): _GEO_BYPASS = False + _PLAYER_INFO = { 'platform': 'desktop', 'app': { @@ -49,6 +47,9 @@ class VRTBaseIE(GigyaBaseIE): 'version': '3.2.6-prod-2023-09-11T12:37:41' } } + + _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n permalink\n seo {\n ...seoFragment\n __typename\n }\n socialSharing {\n ...socialSharingFragment\n __typename\n }\n trackingData {\n data\n perTrigger {\n trigger\n data\n template {\n id\n __typename\n }\n __typename\n }\n __typename\n }\n ldjson\n components {\n __typename\n ... on IComponent {\n componentType\n __typename\n }\n }\n episode {\n id\n title\n available\n whatsonId\n brand\n brandLogos {\n type\n width\n height\n primary\n mono\n __typename\n }\n logo\n primaryMeta {\n ...metaFragment\n __typename\n }\n secondaryMeta {\n ...metaFragment\n __typename\n }\n image {\n ...imageFragment\n __typename\n }\n durationRaw\n durationValue\n durationSeconds\n onTimeRaw\n offTimeRaw\n ageRaw\n regionRaw\n announcementValue\n name\n episodeNumberRaw\n episodeNumberValue\n subtitle\n richDescription {\n __typename\n html\n }\n program {\n id\n link\n title\n __typename\n }\n watchAction {\n streamId\n videoId\n episodeId\n avodUrl\n resumePoint\n __typename\n }\n shareAction {\n title\n description\n image {\n templateUrl\n __typename\n }\n url\n __typename\n }\n favoriteAction {\n id\n title\n favorite\n programWhatsonId\n programUrl\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment metaFragment on MetaDataItem {\n __typename\n type\n value\n shortValue\n longValue\n}\nfragment imageFragment on Image {\n objectId\n id: objectId\n alt\n title\n focalPoint\n templateUrl\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}\nfragment socialSharingFragment on SocialSharingProperties {\n __typename\n title\n description\n image {\n __typename\n id: objectId\n templateUrl\n }\n}" + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' @@ -95,17 +96,24 @@ class VRTBaseIE(GigyaBaseIE): def _call_api(self, video_id, client='null', id_token=None, version='v2'): player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} - player_token = self._download_json( - 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', - video_id, 'Downloading player token', headers={ - **self.geo_verification_headers(), - 'Content-Type': 'application/json', - }, data=json.dumps({ - 'identityToken': id_token or {}, - 'playerInfo': jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ + player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ 'kid': self._JWT_KEY_ID }).decode() - }, separators=(',', ':')).encode())['vrtPlayerToken'] + + headers = { + **self.geo_verification_headers(), + 'Content-Type': 'application/json', + } + + data = { + 'identityToken': id_token or self._cookies['vrtnu-site_profile_vt'], + 'playerInfo': player_info_jwt + } + + json_response = self._download_json( + 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) + player_token = json_response['vrtPlayerToken'] return self._download_json( f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}', @@ -154,7 +162,6 @@ class VRTIE(VRTBaseIE): } _authenticated = False - _id_token = '' def _perform_login(self, username, password): auth_info = self._gigya_login({ @@ -312,19 +319,11 @@ class VrtNUIE(VRTBaseIE): }] _NETRC_MACHINE = 'vrtnu' _authenticated = False - -# def _extract_cookies(self, res): -# cookies_nvp = [header_value.split(';')[0] for header_value in res.headers.get_all('Set-Cookie')] -# return {name: value for nvp in cookies_nvp for name, value in [nvp.split('=')]} -# -# def _create_cookie_header(self, cookies): -# return { 'Cookie': '; '.join([f'{key}={value}' for key, value in cookies.items()]) } + _cookies = CookiePot() def _perform_login(self, username, password): - cookies = CookiePot() - # TODO: # 1. Does the _request_webpage() respect this opener too? # 2. If so: @@ -335,50 +334,26 @@ class VrtNUIE(VRTBaseIE): # Disable automatic redirection to be able to # grab necessary info in intermediate step - opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(cookies)) + opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(self._cookies)) urllib.request.install_opener(opener) # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) auth_url = res.headers.get_all('Location')[0] -# print("===================================") -# print('login') -# print(res.status) -# print(res.headers.get_all('Location')[0]) -# # for cookie in cookies: -# # print(f'{cookie.name}={cookie.value}') -# print("===================================") - - # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies res = urllib.request.urlopen(auth_url, None) - cookies_header = f'OIDCXSRF={cookies["OIDCXSRF"]}; SESSION={cookies["SESSION"]}' - -# print("===================================") -# print('authorize') -# print(res.status) -# print(cookies) -# print("===================================") - -# sys.exit(0) + cookies_header = f'OIDCXSRF={self._cookies["OIDCXSRF"]}; SESSION={self._cookies["SESSION"]}' # 2. Perform login headers = { 'Content-Type': 'application/json', - 'Oidcxsrf': cookies["OIDCXSRF"], + 'Oidcxsrf': self._cookies["OIDCXSRF"], 'Cookie': cookies_header } post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) -# print("===================================") -# print('perform_login') -# print(res.status) -# print("===================================") - -# sys.exit(0) - # TODO: # . re-enable auto redir here and do step 3 in one urlopen() call? # . should this step be the new "refreshtoken" in _real_extract? @@ -392,144 +367,71 @@ class VrtNUIE(VRTBaseIE): res = urllib.request.urlopen(request, None) callback_url = res.headers.get_all('Location')[0] -# print("===================================") -# print('authorize') -# print(res.status) -# print(res.headers.get_all('Location')[0]) -# # print(cookies) -# # print(json.dumps(tokens)) -# print("===================================") - # 3.b Visit 'callback' headers = { - 'Cookie': f'oidcstate={cookies["oidcstate"]}' + 'Cookie': f'oidcstate={self._cookies["oidcstate"]}' } request = urllib.request.Request(callback_url, headers=headers) res = urllib.request.urlopen(request, None) - _id_token = cookies['vrtnu-site_profile_vt'] - -# print("===================================") -# print('callback') -# print(res.status) -# print(res.headers) -# print(cookies) -# print(json.dumps(tokens)) - -# for cookie in cookies: -# print(f'{cookie.name}={cookie.value}') - print("===================================") - -# sys.exit(0) - - # 4. Obtain vrtPlayerToken - - # TODO: make this a constant at the top - ######################################### - player_info_base = { - 'platform': 'desktop', - 'app': { - 'type': 'browser', - 'name': 'Chrome' - }, - 'device': 'undefined (undefined)', - 'os': { - 'name': 'Windows', - 'version': 'x86_64' - }, - 'player': { - 'name': 'VRT web player', - 'version': '3.2.6-prod-2023-09-11T12:37:41' - } - } - - ######################################### + self._authenticated = True - # TODO: should move to _call_api() - player_info = {'exp': (round(time.time(), 3) + 900), **player_info_base} - player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ - 'kid': self._JWT_KEY_ID - }).decode() + def _real_extract(self, url): + display_id = self._match_id(url) + parsed_url = urllib.parse.urlparse(url) + print(f'pageId: {parsed_url.path.rstrip("/")}.model.json') headers = { - **self.geo_verification_headers(), - 'Content-Type': 'application/json', - } + 'Origin': 'https://www.vrt.be', + 'Referer': f'{url}', + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' + } data = { - 'identityToken': _id_token, - 'playerInfo': player_info_jwt + 'operationName': 'VideoPage', + 'query': self._VIDEOPAGE_QUERY , + 'variables': { + 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' + } } - json_response = self._download_json( - 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', - None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) - - print("===================================") - print(json.dumps(json_response)) - print("===================================") + model_json = self._download_json( + 'https://www.vrt.be/vrtnu-api/graphql/v1', + display_id, 'Downloading asset JSON', 'Unable to download asset JSON', headers=headers, data=json.dumps(data).encode())['data']['page'] - sys.exit(0) + video_id = model_json['episode']['watchAction']['streamId'] + title = model_json['seo']['title'] + season_number = int(model_json['episode']['onTimeRaw'][:4]) + ld_json = json.loads(model_json['ldjson'][1]) - self._authenticated = True - - def _real_extract(self, url): - display_id = self._match_id(url) - parsed_url = urllib.parse.urlparse(url) - print(f'Model JSON URL: {parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json') - details = self._download_json( - f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json', - display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details'] - - watch_info = traverse_obj(details, ( - 'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {} - video_id = join_nonempty( - 'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info) - if '$' not in video_id: - raise ExtractorError('Unable to extract video ID') - -# vrtnutoken = self._download_json( -# 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', -# errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None - - video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken) - - if 'title' not in video_info: - code = video_info.get('code') - if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'): - self.raise_login_required(code, method='password') - elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'): - self.raise_geo_restricted(countries=['BE']) - elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS': - if not self._authenticated: - self.raise_login_required(code, method='password') - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(code, expected=True) - - formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id) + streaming_json = self._call_api(video_id, client='vrtnu-web@PROD') + formats, subtitles = self._extract_formats_and_subtitles(streaming_json, video_id) return { - **traverse_obj(details, { - 'title': 'title', - 'description': ('description', {clean_html}), - 'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), - 'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), - 'series': ('data', 'program', 'title'), - 'season': ('data', 'season', 'title', 'value'), - 'season_number': ('data', 'season', 'title', 'raw', {int_or_none}), - 'season_id': ('data', 'season', 'id', {str_or_none}), - 'episode': ('data', 'episode', 'number', 'value', {str_or_none}), - 'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}), - 'episode_id': ('data', 'episode', 'id', {str_or_none}), - 'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}), + **traverse_obj(model_json, { + 'description': ('seo', 'description', {clean_html}), + 'timestamp': ( 'episode', 'onTimeRaw', {parse_iso8601}), + 'release_timestamp': ( 'episode', 'onTimeRaw', {parse_iso8601}), + 'series': ('episode', 'program', 'title'), + 'episode': ('episode', 'episodeNumberRaw', {str_or_none}), + 'episode_number': ('episode', 'episodeNumberRaw', {int_or_none}), + 'age_limit': ('episode', 'ageRaw', {parse_age_limit}), + 'display_id': ('episode', 'name', {parse_age_limit}), + }), + **traverse_obj(ld_json, { + 'season': ('partOfSeason', 'name'), + 'season_id': ('partOfSeason', '@id'), + 'episode_id': ('@id', {str_or_none}), }), + 'title': title, + 'season_number': season_number, 'id': video_id, - 'display_id': display_id, 'channel': 'VRT', 'formats': formats, - 'duration': float_or_none(video_info.get('duration'), 1000), - 'thumbnail': url_or_none(video_info.get('posterImageUrl')), + 'duration': float_or_none(streaming_json.get('duration'), 1000), + 'thumbnail': url_or_none(streaming_json.get('posterImageUrl')), 'subtitles': subtitles, '_old_archive_ids': [make_archive_id('Canvas', video_id)], } From 44b69976b11c4ff62b5a32fcd7cb11b9cff1862a Mon Sep 17 00:00:00 2001 From: bergoid Date: Thu, 21 Sep 2023 00:11:48 +0200 Subject: [PATCH 11/23] Disabled playerInfo token --- yt_dlp/extractor/vrt.py | 78 ++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 47 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 333ff1f9c..43bf65727 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -31,22 +31,22 @@ from ..utils import ( class VRTBaseIE(GigyaBaseIE): _GEO_BYPASS = False - _PLAYER_INFO = { - 'platform': 'desktop', - 'app': { - 'type': 'browser', - 'name': 'Chrome' - }, - 'device': 'undefined (undefined)', - 'os': { - 'name': 'Windows', - 'version': 'x86_64' - }, - 'player': { - 'name': 'VRT web player', - 'version': '3.2.6-prod-2023-09-11T12:37:41' - } - } +# _PLAYER_INFO = { +# 'platform': 'desktop', +# 'app': { +# 'type': 'browser', +# 'name': 'Chrome' +# }, +# 'device': 'undefined (undefined)', +# 'os': { +# 'name': 'Windows', +# 'version': 'x86_64' +# }, +# 'player': { +# 'name': 'VRT web player', +# 'version': '3.2.6-prod-2023-09-11T12:37:41' +# } +# } _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n permalink\n seo {\n ...seoFragment\n __typename\n }\n socialSharing {\n ...socialSharingFragment\n __typename\n }\n trackingData {\n data\n perTrigger {\n trigger\n data\n template {\n id\n __typename\n }\n __typename\n }\n __typename\n }\n ldjson\n components {\n __typename\n ... on IComponent {\n componentType\n __typename\n }\n }\n episode {\n id\n title\n available\n whatsonId\n brand\n brandLogos {\n type\n width\n height\n primary\n mono\n __typename\n }\n logo\n primaryMeta {\n ...metaFragment\n __typename\n }\n secondaryMeta {\n ...metaFragment\n __typename\n }\n image {\n ...imageFragment\n __typename\n }\n durationRaw\n durationValue\n durationSeconds\n onTimeRaw\n offTimeRaw\n ageRaw\n regionRaw\n announcementValue\n name\n episodeNumberRaw\n episodeNumberValue\n subtitle\n richDescription {\n __typename\n html\n }\n program {\n id\n link\n title\n __typename\n }\n watchAction {\n streamId\n videoId\n episodeId\n avodUrl\n resumePoint\n __typename\n }\n shareAction {\n title\n description\n image {\n templateUrl\n __typename\n }\n url\n __typename\n }\n favoriteAction {\n id\n title\n favorite\n programWhatsonId\n programUrl\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment metaFragment on MetaDataItem {\n __typename\n type\n value\n shortValue\n longValue\n}\nfragment imageFragment on Image {\n objectId\n id: objectId\n alt\n title\n focalPoint\n templateUrl\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}\nfragment socialSharingFragment on SocialSharingProperties {\n __typename\n title\n description\n image {\n __typename\n id: objectId\n templateUrl\n }\n}" @@ -95,32 +95,31 @@ class VRTBaseIE(GigyaBaseIE): return formats, subtitles def _call_api(self, video_id, client='null', id_token=None, version='v2'): - player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} - player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ - 'kid': self._JWT_KEY_ID - }).decode() +# player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} +# player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ +# 'kid': self._JWT_KEY_ID +# }).decode() headers = { - **self.geo_verification_headers(), - 'Content-Type': 'application/json', + 'Content-Type': 'application/json' } data = { 'identityToken': id_token or self._cookies['vrtnu-site_profile_vt'], - 'playerInfo': player_info_jwt +# 'playerInfo': player_info_jwt } json_response = self._download_json( - 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( - f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}', + f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/videos/{video_id}', video_id, 'Downloading API JSON', query={ 'vrtPlayerToken': player_token, 'client': client, - }, expected_status=400) + }) class VRTIE(VRTBaseIE): @@ -268,7 +267,7 @@ class CookiePot(CookieJar): class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' _TESTS = [{ # CONTENT_IS_AGE_RESTRICTED 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/', @@ -323,26 +322,16 @@ class VrtNUIE(VRTBaseIE): def _perform_login(self, username, password): - - # TODO: - # 1. Does the _request_webpage() respect this opener too? - # 2. If so: - # a. modify (and rename) the class NoRedirect to store the 'Location:' header - # b. Steps 1.a & 1.b become one call to _request_webpage() - # - # https://stackoverflow.com/questions/47002795/how-to-trace-or-to-check-history-of-redirected-urls-with-python-only-urllib-libr - # Disable automatic redirection to be able to # grab necessary info in intermediate step - opener= urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(self._cookies)) - urllib.request.install_opener(opener) + opener = urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(self._cookies)) # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie - res = urllib.request.urlopen('https://www.vrt.be/vrtnu/sso/login', None) + res = opener.open('https://www.vrt.be/vrtnu/sso/login', None) auth_url = res.headers.get_all('Location')[0] # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies - res = urllib.request.urlopen(auth_url, None) + res = opener.open(auth_url, None) cookies_header = f'OIDCXSRF={self._cookies["OIDCXSRF"]}; SESSION={self._cookies["SESSION"]}' # 2. Perform login @@ -355,16 +344,14 @@ class VrtNUIE(VRTBaseIE): res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) # TODO: - # . re-enable auto redir here and do step 3 in one urlopen() call? # . should this step be the new "refreshtoken" in _real_extract? # 3.a Visit 'authorize' again headers = { - 'Host': 'login.vrt.be', 'Cookie': cookies_header } request = urllib.request.Request(auth_url, headers=headers) - res = urllib.request.urlopen(request, None) + res = opener.open(request, None) callback_url = res.headers.get_all('Location')[0] # 3.b Visit 'callback' @@ -372,7 +359,7 @@ class VrtNUIE(VRTBaseIE): 'Cookie': f'oidcstate={self._cookies["oidcstate"]}' } request = urllib.request.Request(callback_url, headers=headers) - res = urllib.request.urlopen(request, None) + res = opener.open(request, None) self._authenticated = True @@ -380,11 +367,8 @@ class VrtNUIE(VRTBaseIE): def _real_extract(self, url): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) - print(f'pageId: {parsed_url.path.rstrip("/")}.model.json') headers = { - 'Origin': 'https://www.vrt.be', - 'Referer': f'{url}', 'Content-Type': 'application/json', 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' } From c90ab9dcd6594cce7f6f346f8c1fcd721460c322 Mon Sep 17 00:00:00 2001 From: bergoid Date: Thu, 21 Sep 2023 22:13:24 +0200 Subject: [PATCH 12/23] Linting --- yt_dlp/extractor/vrt.py | 80 ++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 43bf65727..17aa1ff53 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,12 +1,12 @@ import functools import json -import time +# import time from http.cookiejar import CookieJar import urllib.parse import urllib.request from .gigya import GigyaBaseIE -from ..networking.exceptions import HTTPError +# from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, clean_html, @@ -15,8 +15,8 @@ from ..utils import ( get_element_by_class, get_element_html_by_class, int_or_none, - join_nonempty, - jwt_encode_hs256, + # join_nonempty, + # jwt_encode_hs256, make_archive_id, parse_age_limit, parse_iso8601, @@ -95,23 +95,23 @@ class VRTBaseIE(GigyaBaseIE): return formats, subtitles def _call_api(self, video_id, client='null', id_token=None, version='v2'): -# player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} -# player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ -# 'kid': self._JWT_KEY_ID -# }).decode() + # player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} + # player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ + # 'kid': self._JWT_KEY_ID + # }).decode() headers = { - 'Content-Type': 'application/json' - } + 'Content-Type': 'application/json' + } data = { - 'identityToken': id_token or self._cookies['vrtnu-site_profile_vt'], -# 'playerInfo': player_info_jwt - } + 'identityToken': id_token or self._cookies['vrtnu-site_profile_vt'], + # 'playerInfo': player_info_jwt + } json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) + None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( @@ -140,8 +140,6 @@ class VRTIE(VRTBaseIE): 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', 'info_dict': { 'id': 'pbs-pub-e1d6e4ec-cbf4-451e-9e87-d835bb65cd28$vid-2ad45eb6-9bc8-40d4-ad72-5f25c0f59d75', - 'title': 'Trailer \'Heizel 1985\'', - 'thumbnail': 'https://images.vrt.be/orig/2022/09/07/6e44ce6f-2eb3-11ed-b07d-02b7b76bf47f.jpg', 'ext': 'mp4', 'title': 'De Belgian Cats zijn klaar voor het EK', 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', @@ -197,7 +195,7 @@ class VRTIE(VRTBaseIE): }, data=urlencode_postdata(post_data)) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: login_attempt += 1 self.report_warning('Authentication failed') self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') @@ -324,7 +322,7 @@ class VrtNUIE(VRTBaseIE): # Disable automatic redirection to be able to # grab necessary info in intermediate step - opener = urllib.request.build_opener(NoRedirect,urllib.request.HTTPCookieProcessor(self._cookies)) + opener = urllib.request.build_opener(NoRedirect, urllib.request.HTTPCookieProcessor(self._cookies)) # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie res = opener.open('https://www.vrt.be/vrtnu/sso/login', None) @@ -332,15 +330,17 @@ class VrtNUIE(VRTBaseIE): # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies res = opener.open(auth_url, None) + + # TODO: make this a method of CookiePot: get_header(c) with c an array of cookie names cookies_header = f'OIDCXSRF={self._cookies["OIDCXSRF"]}; SESSION={self._cookies["SESSION"]}' # 2. Perform login headers = { - 'Content-Type': 'application/json', - 'Oidcxsrf': self._cookies["OIDCXSRF"], - 'Cookie': cookies_header - } - post_data = { "loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site" } + 'Content-Type': 'application/json', + 'Oidcxsrf': self._cookies["OIDCXSRF"], + 'Cookie': cookies_header + } + post_data = {"loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site"} res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) # TODO: @@ -348,45 +348,45 @@ class VrtNUIE(VRTBaseIE): # 3.a Visit 'authorize' again headers = { - 'Cookie': cookies_header - } + 'Cookie': cookies_header + } request = urllib.request.Request(auth_url, headers=headers) res = opener.open(request, None) callback_url = res.headers.get_all('Location')[0] # 3.b Visit 'callback' headers = { - 'Cookie': f'oidcstate={self._cookies["oidcstate"]}' - } + # TODO: use get_header() + 'Cookie': f'oidcstate={self._cookies["oidcstate"]}' + } request = urllib.request.Request(callback_url, headers=headers) res = opener.open(request, None) self._authenticated = True - def _real_extract(self, url): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' - } + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' + } data = { - 'operationName': 'VideoPage', - 'query': self._VIDEOPAGE_QUERY , - 'variables': { - 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' - } - } + 'operationName': 'VideoPage', + 'query': self._VIDEOPAGE_QUERY, + 'variables': { + 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' + } + } model_json = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', display_id, 'Downloading asset JSON', 'Unable to download asset JSON', headers=headers, data=json.dumps(data).encode())['data']['page'] video_id = model_json['episode']['watchAction']['streamId'] - title = model_json['seo']['title'] + title = model_json['seo']['title'] season_number = int(model_json['episode']['onTimeRaw'][:4]) ld_json = json.loads(model_json['ldjson'][1]) @@ -396,8 +396,8 @@ class VrtNUIE(VRTBaseIE): return { **traverse_obj(model_json, { 'description': ('seo', 'description', {clean_html}), - 'timestamp': ( 'episode', 'onTimeRaw', {parse_iso8601}), - 'release_timestamp': ( 'episode', 'onTimeRaw', {parse_iso8601}), + 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), + 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'series': ('episode', 'program', 'title'), 'episode': ('episode', 'episodeNumberRaw', {str_or_none}), 'episode_number': ('episode', 'episodeNumberRaw', {int_or_none}), From 86d98dab3bd697995368da52bc619899f005620a Mon Sep 17 00:00:00 2001 From: bergoid Date: Thu, 21 Sep 2023 22:55:53 +0200 Subject: [PATCH 13/23] Move token refresh to _real_extract() --- yt_dlp/extractor/vrt.py | 75 +++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 17aa1ff53..d8559e816 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -262,6 +262,9 @@ class CookiePot(CookieJar): def __str__(self): return '\n'.join(f'{cookie.name}={cookie.value}' for cookie in self) + def header(self, cookie_names): + return '; '.join(f'{name}={self[name]}' for name in cookie_names) + class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' @@ -317,57 +320,54 @@ class VrtNUIE(VRTBaseIE): _NETRC_MACHINE = 'vrtnu' _authenticated = False _cookies = CookiePot() + _auth_url = '' - def _perform_login(self, username, password): + # Perform some requests with automatic redirection disabled + # to be able to grab necessary info in intermediate step + _opener = urllib.request.build_opener(NoRedirect, urllib.request.HTTPCookieProcessor(_cookies)) - # Disable automatic redirection to be able to - # grab necessary info in intermediate step - opener = urllib.request.build_opener(NoRedirect, urllib.request.HTTPCookieProcessor(self._cookies)) + def _perform_login(self, username, password): + # 1. Obtain session cookies # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie - res = opener.open('https://www.vrt.be/vrtnu/sso/login', None) - auth_url = res.headers.get_all('Location')[0] + res = self._opener.open('https://www.vrt.be/vrtnu/sso/login', None) + self._auth_url = res.headers.get_all('Location')[0] # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies - res = opener.open(auth_url, None) - - # TODO: make this a method of CookiePot: get_header(c) with c an array of cookie names - cookies_header = f'OIDCXSRF={self._cookies["OIDCXSRF"]}; SESSION={self._cookies["SESSION"]}' + res = self._opener.open(self._auth_url, None) # 2. Perform login headers = { 'Content-Type': 'application/json', 'Oidcxsrf': self._cookies["OIDCXSRF"], - 'Cookie': cookies_header + 'Cookie': self._cookies.header(['OIDCXSRF', 'SESSION']) } post_data = {"loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site"} res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) - # TODO: - # . should this step be the new "refreshtoken" in _real_extract? + self._authenticated = True - # 3.a Visit 'authorize' again + def _real_extract(self, url): + display_id = self._match_id(url) + parsed_url = urllib.parse.urlparse(url) + + # 1. Obtain/refresh 'vrtnu-site_profile' tokens + # 1.a Visit 'authorize' URL again headers = { - 'Cookie': cookies_header + 'Cookie': self._cookies.header(['OIDCXSRF', 'SESSION']) } - request = urllib.request.Request(auth_url, headers=headers) - res = opener.open(request, None) + request = urllib.request.Request(self._auth_url, headers=headers) + res = self._opener.open(request, None) callback_url = res.headers.get_all('Location')[0] - # 3.b Visit 'callback' + # 1.b Follow redirection: visit 'callback' URL headers = { - # TODO: use get_header() - 'Cookie': f'oidcstate={self._cookies["oidcstate"]}' + 'Cookie': self._cookies.header(['oidcstate']) } request = urllib.request.Request(callback_url, headers=headers) - res = opener.open(request, None) - - self._authenticated = True - - def _real_extract(self, url): - display_id = self._match_id(url) - parsed_url = urllib.parse.urlparse(url) + res = self._opener.open(request, None) + # 2. Perform GraphQL query to obtain video metadata headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' @@ -381,20 +381,21 @@ class VrtNUIE(VRTBaseIE): } } - model_json = self._download_json( + metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', display_id, 'Downloading asset JSON', 'Unable to download asset JSON', headers=headers, data=json.dumps(data).encode())['data']['page'] - video_id = model_json['episode']['watchAction']['streamId'] - title = model_json['seo']['title'] - season_number = int(model_json['episode']['onTimeRaw'][:4]) - ld_json = json.loads(model_json['ldjson'][1]) + video_id = metadata['episode']['watchAction']['streamId'] + title = metadata['seo']['title'] + season_number = int(metadata['episode']['onTimeRaw'][:4]) + ld_json = json.loads(metadata['ldjson'][1]) - streaming_json = self._call_api(video_id, client='vrtnu-web@PROD') - formats, subtitles = self._extract_formats_and_subtitles(streaming_json, video_id) + # 3. Obtain streaming info + streaming_info = self._call_api(video_id, client='vrtnu-web@PROD') + formats, subtitles = self._extract_formats_and_subtitles(streaming_info, video_id) return { - **traverse_obj(model_json, { + **traverse_obj(metadata, { 'description': ('seo', 'description', {clean_html}), 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), @@ -414,8 +415,8 @@ class VrtNUIE(VRTBaseIE): 'id': video_id, 'channel': 'VRT', 'formats': formats, - 'duration': float_or_none(streaming_json.get('duration'), 1000), - 'thumbnail': url_or_none(streaming_json.get('posterImageUrl')), + 'duration': float_or_none(streaming_info.get('duration'), 1000), + 'thumbnail': url_or_none(streaming_info.get('posterImageUrl')), 'subtitles': subtitles, '_old_archive_ids': [make_archive_id('Canvas', video_id)], } From abd99c4f0732c8fe61846e39d1c5131df4e22dc7 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 4 Nov 2023 15:36:41 +0100 Subject: [PATCH 14/23] Change parsing of 'description' --- yt_dlp/extractor/vrt.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index d8559e816..c820d768a 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,6 +1,5 @@ import functools import json -# import time from http.cookiejar import CookieJar import urllib.parse import urllib.request @@ -396,7 +395,7 @@ class VrtNUIE(VRTBaseIE): return { **traverse_obj(metadata, { - 'description': ('seo', 'description', {clean_html}), + 'description': ('seo', 'description', {str_or_none}), 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'series': ('episode', 'program', 'title'), From 56b838bbad7526d1b5835681b93e59b783644011 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 4 Nov 2023 21:49:11 +0100 Subject: [PATCH 15/23] Improvements as recommended by seproDev --- yt_dlp/extractor/vrt.py | 95 ++++++++++++----------------------------- 1 file changed, 28 insertions(+), 67 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index c820d768a..492ba4bef 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -26,6 +26,12 @@ from ..utils import ( urlencode_postdata, ) +def parse_year(timestamp): + """ Return the first 4 characters as an int """ + if isinstance(timestamp, str) and len(timestamp) >= 4: + return int_or_none(timestamp[:4]) + else: + return None class VRTBaseIE(GigyaBaseIE): _GEO_BYPASS = False @@ -47,7 +53,7 @@ class VRTBaseIE(GigyaBaseIE): # } # } - _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n permalink\n seo {\n ...seoFragment\n __typename\n }\n socialSharing {\n ...socialSharingFragment\n __typename\n }\n trackingData {\n data\n perTrigger {\n trigger\n data\n template {\n id\n __typename\n }\n __typename\n }\n __typename\n }\n ldjson\n components {\n __typename\n ... on IComponent {\n componentType\n __typename\n }\n }\n episode {\n id\n title\n available\n whatsonId\n brand\n brandLogos {\n type\n width\n height\n primary\n mono\n __typename\n }\n logo\n primaryMeta {\n ...metaFragment\n __typename\n }\n secondaryMeta {\n ...metaFragment\n __typename\n }\n image {\n ...imageFragment\n __typename\n }\n durationRaw\n durationValue\n durationSeconds\n onTimeRaw\n offTimeRaw\n ageRaw\n regionRaw\n announcementValue\n name\n episodeNumberRaw\n episodeNumberValue\n subtitle\n richDescription {\n __typename\n html\n }\n program {\n id\n link\n title\n __typename\n }\n watchAction {\n streamId\n videoId\n episodeId\n avodUrl\n resumePoint\n __typename\n }\n shareAction {\n title\n description\n image {\n templateUrl\n __typename\n }\n url\n __typename\n }\n favoriteAction {\n id\n title\n favorite\n programWhatsonId\n programUrl\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment metaFragment on MetaDataItem {\n __typename\n type\n value\n shortValue\n longValue\n}\nfragment imageFragment on Image {\n objectId\n id: objectId\n alt\n title\n focalPoint\n templateUrl\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}\nfragment socialSharingFragment on SocialSharingProperties {\n __typename\n title\n description\n image {\n __typename\n id: objectId\n templateUrl\n }\n}" + _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' @@ -99,18 +105,13 @@ class VRTBaseIE(GigyaBaseIE): # 'kid': self._JWT_KEY_ID # }).decode() - headers = { - 'Content-Type': 'application/json' - } - - data = { - 'identityToken': id_token or self._cookies['vrtnu-site_profile_vt'], - # 'playerInfo': player_info_jwt - } +# headers = { 'Content-Type': 'application/json' } +# +# data = { 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value } json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', headers=headers, data=json.dumps(data).encode()) + None, 'Downloading player token', headers={ 'Content-Type': 'application/json' }, data=json.dumps({ 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value }).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( @@ -250,21 +251,6 @@ class NoRedirect(urllib.request.HTTPRedirectHandler): http_error_301 = http_error_303 = http_error_307 = http_error_302 -class CookiePot(CookieJar): - - def __getitem__(self, name): - for cookie in self: - if cookie.name == name: - return cookie.value - return None - - def __str__(self): - return '\n'.join(f'{cookie.name}={cookie.value}' for cookie in self) - - def header(self, cookie_names): - return '; '.join(f'{name}={self[name]}' for name in cookie_names) - - class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' _VALID_URL = r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' @@ -318,58 +304,35 @@ class VrtNUIE(VRTBaseIE): }] _NETRC_MACHINE = 'vrtnu' _authenticated = False - _cookies = CookiePot() - _auth_url = '' - - # Perform some requests with automatic redirection disabled - # to be able to grab necessary info in intermediate step - _opener = urllib.request.build_opener(NoRedirect, urllib.request.HTTPCookieProcessor(_cookies)) def _perform_login(self, username, password): - # 1. Obtain session cookies - # 1.a Visit 'login' URL. Get 'authorize' location and 'oidcstate' cookie - res = self._opener.open('https://www.vrt.be/vrtnu/sso/login', None) - self._auth_url = res.headers.get_all('Location')[0] - - # 1.b Follow redirection: visit 'authorize' URL. Get OIDCXSRF & SESSION cookies - res = self._opener.open(self._auth_url, None) - - # 2. Perform login - headers = { - 'Content-Type': 'application/json', - 'Oidcxsrf': self._cookies["OIDCXSRF"], - 'Cookie': self._cookies.header(['OIDCXSRF', 'SESSION']) - } - post_data = {"loginID": f"{username}", "password": f"{password}", "clientId": "vrtnu-site"} - res = self._request_webpage('https://login.vrt.be/perform_login', None, note='Performing login', errnote='Login failed', fatal=True, data=json.dumps(post_data).encode(), headers=headers) - + # sepro proposal + login_page = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None) + + res = self._download_json( + 'https://login.vrt.be/perform_login', None, data=json.dumps({ + "loginID": username, + "password": password, + "clientId": "vrtnu-site" + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + }) self._authenticated = True + return def _real_extract(self, url): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) # 1. Obtain/refresh 'vrtnu-site_profile' tokens - # 1.a Visit 'authorize' URL again - headers = { - 'Cookie': self._cookies.header(['OIDCXSRF', 'SESSION']) - } - request = urllib.request.Request(self._auth_url, headers=headers) - res = self._opener.open(request, None) - callback_url = res.headers.get_all('Location')[0] - - # 1.b Follow redirection: visit 'callback' URL - headers = { - 'Cookie': self._cookies.header(['oidcstate']) - } - request = urllib.request.Request(callback_url, headers=headers) - res = self._opener.open(request, None) + res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Obtaining tokens', errnote='Failed to obtain tokens') # 2. Perform GraphQL query to obtain video metadata headers = { 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self._cookies["vrtnu-site_profile_at"]}' + 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}' } data = { @@ -385,8 +348,6 @@ class VrtNUIE(VRTBaseIE): display_id, 'Downloading asset JSON', 'Unable to download asset JSON', headers=headers, data=json.dumps(data).encode())['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] - title = metadata['seo']['title'] - season_number = int(metadata['episode']['onTimeRaw'][:4]) ld_json = json.loads(metadata['ldjson'][1]) # 3. Obtain streaming info @@ -395,6 +356,8 @@ class VrtNUIE(VRTBaseIE): return { **traverse_obj(metadata, { + 'title': ('seo', 'title', {str_or_none}), + 'season_number': ('episode', 'onTimeRaw', {parse_year}), 'description': ('seo', 'description', {str_or_none}), 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), @@ -409,8 +372,6 @@ class VrtNUIE(VRTBaseIE): 'season_id': ('partOfSeason', '@id'), 'episode_id': ('@id', {str_or_none}), }), - 'title': title, - 'season_number': season_number, 'id': video_id, 'channel': 'VRT', 'formats': formats, From dd935aeda15bef493a666dc4bf07fd7470fdaf7a Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 4 Nov 2023 22:11:19 +0100 Subject: [PATCH 16/23] More improvements --- yt_dlp/extractor/vrt.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 492ba4bef..e6834abe3 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -5,7 +5,6 @@ import urllib.parse import urllib.request from .gigya import GigyaBaseIE -# from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, clean_html, @@ -105,18 +104,14 @@ class VRTBaseIE(GigyaBaseIE): # 'kid': self._JWT_KEY_ID # }).decode() -# headers = { 'Content-Type': 'application/json' } -# -# data = { 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value } - json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', headers={ 'Content-Type': 'application/json' }, data=json.dumps({ 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value }).encode()) + None, 'Downloading player token', 'Failed to download player token', headers={ 'Content-Type': 'application/json' }, data=json.dumps({ 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value }).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/videos/{video_id}', - video_id, 'Downloading API JSON', query={ + video_id, 'Downloading API JSON', 'Failed to download API JSON', query={ 'vrtPlayerToken': player_token, 'client': client, }) @@ -242,15 +237,6 @@ class VRTIE(VRTBaseIE): } -class NoRedirect(urllib.request.HTTPRedirectHandler): - - def http_error_302(self, req, fp, code, msg, headers): - result = urllib.error.HTTPError(req.get_full_url(), code, msg, headers, fp) - return result - - http_error_301 = http_error_303 = http_error_307 = http_error_302 - - class VrtNUIE(VRTBaseIE): IE_DESC = 'VRT MAX' _VALID_URL = r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' @@ -307,8 +293,7 @@ class VrtNUIE(VRTBaseIE): def _perform_login(self, username, password): - # sepro proposal - login_page = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None) + login_page = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') res = self._download_json( 'https://login.vrt.be/perform_login', None, data=json.dumps({ @@ -318,7 +303,7 @@ class VrtNUIE(VRTBaseIE): }).encode(), headers={ 'Content-Type': 'application/json', 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - }) + }, note='Logging in', errnote='Login failed') self._authenticated = True return @@ -327,7 +312,7 @@ class VrtNUIE(VRTBaseIE): parsed_url = urllib.parse.urlparse(url) # 1. Obtain/refresh 'vrtnu-site_profile' tokens - res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Obtaining tokens', errnote='Failed to obtain tokens') + res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') # 2. Perform GraphQL query to obtain video metadata headers = { From aef8a39bf3011e2f387aa7f48fa89798e54ce220 Mon Sep 17 00:00:00 2001 From: bergoid Date: Sat, 4 Nov 2023 22:17:35 +0100 Subject: [PATCH 17/23] flake8 improvements --- yt_dlp/extractor/vrt.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index e6834abe3..9768babe7 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -1,6 +1,5 @@ import functools import json -from http.cookiejar import CookieJar import urllib.parse import urllib.request @@ -25,6 +24,7 @@ from ..utils import ( urlencode_postdata, ) + def parse_year(timestamp): """ Return the first 4 characters as an int """ if isinstance(timestamp, str) and len(timestamp) >= 4: @@ -32,6 +32,7 @@ def parse_year(timestamp): else: return None + class VRTBaseIE(GigyaBaseIE): _GEO_BYPASS = False @@ -106,7 +107,7 @@ class VRTBaseIE(GigyaBaseIE): json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', 'Failed to download player token', headers={ 'Content-Type': 'application/json' }, data=json.dumps({ 'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value }).encode()) + None, 'Downloading player token', 'Failed to download player token', headers={'Content-Type': 'application/json'}, data=json.dumps({'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value}).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( @@ -293,17 +294,17 @@ class VrtNUIE(VRTBaseIE): def _perform_login(self, username, password): - login_page = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') + self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') - res = self._download_json( + self._download_json( 'https://login.vrt.be/perform_login', None, data=json.dumps({ "loginID": username, "password": password, "clientId": "vrtnu-site" - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - }, note='Logging in', errnote='Login failed') + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + }, note='Logging in', errnote='Login failed') self._authenticated = True return @@ -312,7 +313,7 @@ class VrtNUIE(VRTBaseIE): parsed_url = urllib.parse.urlparse(url) # 1. Obtain/refresh 'vrtnu-site_profile' tokens - res = self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') + self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') # 2. Perform GraphQL query to obtain video metadata headers = { From 6e8661c0ed922b62bdf2e750f34f713ba52bc070 Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 14 Nov 2023 21:05:39 +0100 Subject: [PATCH 18/23] More changes recommended by reviewers --- yt_dlp/extractor/vrt.py | 133 ++++++++-------------------------------- 1 file changed, 26 insertions(+), 107 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 9768babe7..0057cef09 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -3,7 +3,7 @@ import json import urllib.parse import urllib.request -from .gigya import GigyaBaseIE +from .common import InfoExtractor from ..utils import ( ExtractorError, clean_html, @@ -12,8 +12,6 @@ from ..utils import ( get_element_by_class, get_element_html_by_class, int_or_none, - # join_nonempty, - # jwt_encode_hs256, make_archive_id, parse_age_limit, parse_iso8601, @@ -25,42 +23,28 @@ from ..utils import ( ) -def parse_year(timestamp): - """ Return the first 4 characters as an int """ - if isinstance(timestamp, str) and len(timestamp) >= 4: - return int_or_none(timestamp[:4]) - else: - return None +class VRTBaseIE(InfoExtractor): + _GEO_BYPASS = False + _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" -class VRTBaseIE(GigyaBaseIE): - _GEO_BYPASS = False + _authenticated = False -# _PLAYER_INFO = { -# 'platform': 'desktop', -# 'app': { -# 'type': 'browser', -# 'name': 'Chrome' -# }, -# 'device': 'undefined (undefined)', -# 'os': { -# 'name': 'Windows', -# 'version': 'x86_64' -# }, -# 'player': { -# 'name': 'VRT web player', -# 'version': '3.2.6-prod-2023-09-11T12:37:41' -# } -# } + def _perform_login(self, username, password): - _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" + self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') - # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js - _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' - _JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae' -# _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev - # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595 - # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae + self._download_json( + 'https://login.vrt.be/perform_login', None, data=json.dumps({ + 'loginID': username, + 'password': password, + 'clientId': 'vrtnu-site' + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + }, note='Logging in', errnote='Login failed') + self._authenticated = True + return def _extract_formats_and_subtitles(self, data, video_id): if traverse_obj(data, 'drm'): @@ -100,14 +84,11 @@ class VRTBaseIE(GigyaBaseIE): return formats, subtitles def _call_api(self, video_id, client='null', id_token=None, version='v2'): - # player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} - # player_info_jwt = jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ - # 'kid': self._JWT_KEY_ID - # }).decode() - json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', 'Failed to download player token', headers={'Content-Type': 'application/json'}, data=json.dumps({'identityToken': id_token or self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_vt").value}).encode()) + None, 'Downloading player token', 'Failed to download player token', + headers={'Content-Type': 'application/json'}, + data=json.dumps({'identityToken': id_token or self._get_cookies('https://www.vrt.be').get('vrtnu-site_profile_vt').value}).encode()) player_token = json_response['vrtPlayerToken'] return self._download_json( @@ -154,54 +135,6 @@ class VRTIE(VRTBaseIE): 'HLS_AES': 'm3u8_native', } - _authenticated = False - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - self._authenticated = True - def _real_extract(self, url): site, display_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, display_id) @@ -290,23 +223,6 @@ class VrtNUIE(VRTBaseIE): 'params': {'skip_download': 'm3u8'}, }] _NETRC_MACHINE = 'vrtnu' - _authenticated = False - - def _perform_login(self, username, password): - - self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') - - self._download_json( - 'https://login.vrt.be/perform_login', None, data=json.dumps({ - "loginID": username, - "password": password, - "clientId": "vrtnu-site" - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - }, note='Logging in', errnote='Login failed') - self._authenticated = True - return def _real_extract(self, url): display_id = self._match_id(url) @@ -331,7 +247,10 @@ class VrtNUIE(VRTBaseIE): metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', - display_id, 'Downloading asset JSON', 'Unable to download asset JSON', headers=headers, data=json.dumps(data).encode())['data']['page'] + display_id, 'Downloading asset JSON', 'Unable to download asset JSON', + headers=headers, + data=json.dumps(data).encode() + )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] ld_json = json.loads(metadata['ldjson'][1]) @@ -343,7 +262,7 @@ class VrtNUIE(VRTBaseIE): return { **traverse_obj(metadata, { 'title': ('seo', 'title', {str_or_none}), - 'season_number': ('episode', 'onTimeRaw', {parse_year}), + 'season_number': ('episode', 'onTimeRaw', {lambda x: x[:4]}, {int_or_none}), 'description': ('seo', 'description', {str_or_none}), 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), From 102164411960a734dd97acfbbe02eb1882c1fbea Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 14 Nov 2023 21:08:10 +0100 Subject: [PATCH 19/23] Inline function args --- yt_dlp/extractor/vrt.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 0057cef09..75db50655 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -232,24 +232,11 @@ class VrtNUIE(VRTBaseIE): self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') # 2. Perform GraphQL query to obtain video metadata - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}' - } - - data = { - 'operationName': 'VideoPage', - 'query': self._VIDEOPAGE_QUERY, - 'variables': { - 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' - } - } - metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', display_id, 'Downloading asset JSON', 'Unable to download asset JSON', - headers=headers, - data=json.dumps(data).encode() + headers={ 'Content-Type': 'application/json', 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}'}, + data=json.dumps({'operationName': 'VideoPage', 'query': self._VIDEOPAGE_QUERY, 'variables': { 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' } }).encode() )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] From e8689a27cec141855fa9de165e59824bf06dc3b9 Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 14 Nov 2023 22:14:20 +0100 Subject: [PATCH 20/23] More recommended changes --- yt_dlp/extractor/vrt.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 75db50655..193718726 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -26,8 +26,6 @@ from ..utils import ( class VRTBaseIE(InfoExtractor): _GEO_BYPASS = False - _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" - _authenticated = False def _perform_login(self, username, password): @@ -224,14 +222,14 @@ class VrtNUIE(VRTBaseIE): }] _NETRC_MACHINE = 'vrtnu' + _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" + def _real_extract(self, url): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) - # 1. Obtain/refresh 'vrtnu-site_profile' tokens self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') - # 2. Perform GraphQL query to obtain video metadata metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', display_id, 'Downloading asset JSON', 'Unable to download asset JSON', @@ -240,9 +238,9 @@ class VrtNUIE(VRTBaseIE): )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] + # TODO : handle parse errors ld_json = json.loads(metadata['ldjson'][1]) - # 3. Obtain streaming info streaming_info = self._call_api(video_id, client='vrtnu-web@PROD') formats, subtitles = self._extract_formats_and_subtitles(streaming_info, video_id) From 62204182c610bc84553d91eaa63a568eec82eadc Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 14 Nov 2023 22:16:42 +0100 Subject: [PATCH 21/23] Linting --- yt_dlp/extractor/vrt.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 193718726..b3b3cad69 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -5,7 +5,6 @@ import urllib.request from .common import InfoExtractor from ..utils import ( - ExtractorError, clean_html, extract_attributes, float_or_none, @@ -19,7 +18,6 @@ from ..utils import ( strip_or_none, traverse_obj, url_or_none, - urlencode_postdata, ) @@ -233,9 +231,9 @@ class VrtNUIE(VRTBaseIE): metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', display_id, 'Downloading asset JSON', 'Unable to download asset JSON', - headers={ 'Content-Type': 'application/json', 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}'}, - data=json.dumps({'operationName': 'VideoPage', 'query': self._VIDEOPAGE_QUERY, 'variables': { 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' } }).encode() - )['data']['page'] + headers={'Content-Type': 'application/json', 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}'}, + data=json.dumps({'operationName': 'VideoPage', 'query': self._VIDEOPAGE_QUERY, 'variables': {'pageId': f'{parsed_url.path.rstrip("/")}.model.json'}}).encode() + )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] # TODO : handle parse errors From 19f01cacde37190fb0d2132e7964c3b0ff2197bf Mon Sep 17 00:00:00 2001 From: bergoid Date: Sun, 26 Nov 2023 21:59:47 +0100 Subject: [PATCH 22/23] Tests, improvements, cleanup --- yt_dlp/extractor/vrt.py | 149 +++++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index b3b3cad69..8093f78a1 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -14,6 +14,7 @@ from ..utils import ( make_archive_id, parse_age_limit, parse_iso8601, + unified_strdate, str_or_none, strip_or_none, traverse_obj, @@ -24,24 +25,6 @@ from ..utils import ( class VRTBaseIE(InfoExtractor): _GEO_BYPASS = False - _authenticated = False - - def _perform_login(self, username, password): - - self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') - - self._download_json( - 'https://login.vrt.be/perform_login', None, data=json.dumps({ - 'loginID': username, - 'password': password, - 'clientId': 'vrtnu-site' - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - }, note='Logging in', errnote='Login failed') - self._authenticated = True - return - def _extract_formats_and_subtitles(self, data, video_id): if traverse_obj(data, 'drm'): self.report_drm(video_id) @@ -95,7 +78,29 @@ class VRTBaseIE(InfoExtractor): }) -class VRTIE(VRTBaseIE): +class VRTLoginIE(VRTBaseIE): + + _NETRC_MACHINE = 'vrtnu' + _authenticated = False + + def _perform_login(self, username, password): + + self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') + + self._download_json( + 'https://login.vrt.be/perform_login', None, data=json.dumps({ + 'loginID': username, + 'password': password, + 'clientId': 'vrtnu-site' + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + }, note='Logging in', errnote='Login failed') + self._authenticated = True + return + + +class VRTIE(VRTLoginIE): IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' _VALID_URL = r'https?://(?:www\.)?(?Pvrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P[^/?&#]+)' _TESTS = [{ @@ -121,7 +126,6 @@ class VRTIE(VRTBaseIE): }, 'params': {'skip_download': 'm3u8'}, }] - _NETRC_MACHINE = 'vrtnu' _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' _REST_API_BASE_TOKEN = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' @@ -167,60 +171,59 @@ class VRTIE(VRTBaseIE): } -class VrtNUIE(VRTBaseIE): +class VrtNUIE(VRTLoginIE): IE_DESC = 'VRT MAX' _VALID_URL = r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' - _TESTS = [{ - # CONTENT_IS_AGE_RESTRICTED - 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/', - 'info_dict': { - 'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f', - 'ext': 'mp4', - 'title': 'Tom Waes', - 'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.', - 'timestamp': 1673905125, - 'release_timestamp': 1673905125, - 'series': 'De ideale wereld', - 'season_id': '1672830988794', - 'episode': 'Aflevering 1', - 'episode_number': 1, - 'episode_id': '1672830988861', - 'display_id': 'de-ideale-wereld-d20230116', - 'channel': 'VRT', - 'duration': 1939.0, - 'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg', - 'release_date': '20230116', - 'upload_date': '20230116', - 'age_limit': 12, + _TESTS = [ + { + 'url': 'https://www.vrt.be/vrtmax/a-z/pano/trailer/pano-trailer-najaar-2023/', + 'info_dict': { + 'title': 'Pano - Nieuwe afleveringen vanaf 15 november - Trailer | VRT MAX', + 'description': 'Duidingsmagazine met indringende reportages over de grote thema\'s van deze tijd. Een gedreven team van reporters diept de beste nieuwsverhalen uit en zoekt het antwoord op actuele vragen. Bekijk de trailer met VRT MAX via de site of app.', + 'timestamp': 1699246800, + 'release_timestamp': 1699246800, + 'release_date': '20231106', + 'upload_date': '20231106', + 'series': 'Pano', + 'season': 'Trailer', + 'season_number': 2023, + 'season_id': '/vrtnu/a-z/pano/trailer/#tvseason', + 'episode_id': '3226122918145', + 'id': 'pbs-pub-5260ad6d-372c-46d3-a542-0e781fd5831a$vid-75fdb750-82f5-4157-8ea9-4485f303f20b', + 'channel': 'VRT', + 'duration': 37.16, + 'thumbnail': 'https://images.vrt.be/orig/2023/11/03/f570eb9b-7a4e-11ee-91d7-02b7b76bf47f.jpg', + 'ext': 'mp4', + }, }, - }, { - 'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/', - 'info_dict': { - 'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee', - 'ext': 'mp4', - 'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'', - 'description': 'md5:197424726c61384b4e5c519f16c0cf02', - 'timestamp': 1652940000, - 'release_timestamp': 1652940000, - 'series': 'Buurman, wat doet u nu?', - 'season': 'Seizoen 6', - 'season_number': 6, - 'season_id': '1652344200907', - 'episode': 'Aflevering 0', - 'episode_number': 0, - 'episode_id': '1652951873524', - 'display_id': 'buurman--wat-doet-u-nu--s6-trailer', - 'channel': 'VRT', - 'duration': 33.13, - 'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg', - 'release_date': '20220519', - 'upload_date': '20220519', - }, - 'params': {'skip_download': 'm3u8'}, - }] + { + 'url': 'https://www.vrt.be/vrtnu/a-z/factcheckers/trailer/factcheckers-trailer-s4/', + 'info_dict': { + 'title': 'Factcheckers - Nieuwe afleveringen vanaf 15 november - Trailer | VRT MAX', + 'season_number': 2023, + 'description': 'Infotainmentprogramma waarin Thomas Vanderveken, Jan Van Looveren en Britt Van Marsenille checken wat er nu eigenlijk klopt van de tsunami aan berichten, beweringen en weetjes die we dagelijks over ons heen krijgen. Bekijk de trailer met VRT MAX via de site of app.', + 'timestamp': 1699160400, + 'release_timestamp': 1699160400, + 'release_date': '20231105', + 'upload_date': '20231105', + 'series': 'Factcheckers', + 'episode': '0', + 'episode_number': 0, + 'season': 'Trailer', + 'season_id': '/vrtnu/a-z/factcheckers/trailer/#tvseason', + 'episode_id': '3179360900145', + 'id': 'pbs-pub-aa9397e9-ec2b-45f9-9148-7ce71b690b45$vid-04c67438-4866-4f5c-8978-51d173c0074b', + 'channel': 'VRT', + 'duration': 33.08, + 'thumbnail': 'https://images.vrt.be/orig/2023/11/07/37d244f0-7d8a-11ee-91d7-02b7b76bf47f.jpg', + 'ext': 'mp4', + }, + } + ] + _NETRC_MACHINE = 'vrtnu' - _VIDEOPAGE_QUERY = "query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}" + _VIDEOPAGE_QUERY = 'query VideoPage($pageId: ID!) {\n page(id: $pageId) {\n ... on EpisodePage {\n id\n title\n seo {\n ...seoFragment\n __typename\n }\n ldjson\n episode {\n onTimeRaw\n ageRaw\n name\n episodeNumberRaw\n program {\n title\n __typename\n }\n watchAction {\n streamId\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\nfragment seoFragment on SeoProperties {\n __typename\n title\n description\n}' def _real_extract(self, url): display_id = self._match_id(url) @@ -236,8 +239,10 @@ class VrtNUIE(VRTBaseIE): )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] - # TODO : handle parse errors - ld_json = json.loads(metadata['ldjson'][1]) + try: + ld_json = json.loads(metadata['ldjson'][1]) + except Exception: + ld_json = {} streaming_info = self._call_api(video_id, client='vrtnu-web@PROD') formats, subtitles = self._extract_formats_and_subtitles(streaming_info, video_id) @@ -249,6 +254,8 @@ class VrtNUIE(VRTBaseIE): 'description': ('seo', 'description', {str_or_none}), 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), + 'release_date': ('episode', 'onTimeRaw', {unified_strdate}), + 'upload_date': ('episode', 'onTimeRaw', {unified_strdate}), 'series': ('episode', 'program', 'title'), 'episode': ('episode', 'episodeNumberRaw', {str_or_none}), 'episode_number': ('episode', 'episodeNumberRaw', {int_or_none}), From df2c44a034ad40e01d55d26e322edecd1552d215 Mon Sep 17 00:00:00 2001 From: bergoid Date: Tue, 28 Nov 2023 19:27:01 +0100 Subject: [PATCH 23/23] Line length --- yt_dlp/extractor/vrt.py | 350 +++++++++++++++++++++++++--------------- 1 file changed, 224 insertions(+), 126 deletions(-) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 8093f78a1..7bf39a7c4 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -30,34 +30,46 @@ class VRTBaseIE(InfoExtractor): self.report_drm(video_id) formats, subtitles = [], {} - for target in traverse_obj(data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + for target in traverse_obj( + data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type']) + ): format_type = target['type'].upper() format_url = target['url'] if format_type in ('HLS', 'HLS_AES'): fmts, subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False) + format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False + ) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) + formats.extend( + self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False + ) + ) elif format_type == 'MPEG_DASH': fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) + format_url, video_id, mpd_id=format_type, fatal=False + ) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif format_type == 'HSS': fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', fatal=False) + format_url, video_id, ism_id='mss', fatal=False + ) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - - for sub in traverse_obj(data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED')): + formats.append( + { + 'format_id': format_type, + 'url': format_url, + } + ) + + for sub in traverse_obj( + data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED') + ): subtitles.setdefault('nl', []).append({'url': sub['url']}) return formats, subtitles @@ -65,37 +77,60 @@ class VRTBaseIE(InfoExtractor): def _call_api(self, video_id, client='null', id_token=None, version='v2'): json_response = self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens', - None, 'Downloading player token', 'Failed to download player token', + None, + 'Downloading player token', + 'Failed to download player token', headers={'Content-Type': 'application/json'}, - data=json.dumps({'identityToken': id_token or self._get_cookies('https://www.vrt.be').get('vrtnu-site_profile_vt').value}).encode()) + data=json.dumps( + { + 'identityToken': id_token + or self._get_cookies('https://www.vrt.be') + .get('vrtnu-site_profile_vt') + .value + } + ).encode(), + ) player_token = json_response['vrtPlayerToken'] return self._download_json( f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/videos/{video_id}', - video_id, 'Downloading API JSON', 'Failed to download API JSON', query={ + video_id, + 'Downloading API JSON', + 'Failed to download API JSON', + query={ 'vrtPlayerToken': player_token, 'client': client, - }) + }, + ) class VRTLoginIE(VRTBaseIE): - _NETRC_MACHINE = 'vrtnu' _authenticated = False def _perform_login(self, username, password): - - self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting session cookies', errnote='Failed to get session cookies') + self._request_webpage( + 'https://www.vrt.be/vrtnu/sso/login', + None, + note='Getting session cookies', + errnote='Failed to get session cookies', + ) self._download_json( - 'https://login.vrt.be/perform_login', None, data=json.dumps({ - 'loginID': username, - 'password': password, - 'clientId': 'vrtnu-site' - }).encode(), headers={ + 'https://login.vrt.be/perform_login', + None, + data=json.dumps( + {'loginID': username, 'password': password, 'clientId': 'vrtnu-site'} + ).encode(), + headers={ 'Content-Type': 'application/json', - 'Oidcxsrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - }, note='Logging in', errnote='Login failed') + 'Oidcxsrf': self._get_cookies('https://login.vrt.be') + .get('OIDCXSRF') + .value, + }, + note='Logging in', + errnote='Login failed', + ) self._authenticated = True return @@ -103,29 +138,32 @@ class VRTLoginIE(VRTBaseIE): class VRTIE(VRTLoginIE): IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' _VALID_URL = r'https?://(?:www\.)?(?Pvrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P[^/?&#]+)' - _TESTS = [{ - 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'info_dict': { - 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', - 'ext': 'mp4', - 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff', - 'duration': 31.2, - 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg', + _TESTS = [ + { + 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', + 'info_dict': { + 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', + 'ext': 'mp4', + 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', + 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff', + 'duration': 31.2, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, - 'params': {'skip_download': 'm3u8'}, - }, { - 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'info_dict': { - 'id': 'pbs-pub-e1d6e4ec-cbf4-451e-9e87-d835bb65cd28$vid-2ad45eb6-9bc8-40d4-ad72-5f25c0f59d75', - 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK', - 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', - 'duration': 115.17, - 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg', + { + 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', + 'info_dict': { + 'id': 'pbs-pub-e1d6e4ec-cbf4-451e-9e87-d835bb65cd28$vid-2ad45eb6-9bc8-40d4-ad72-5f25c0f59d75', + 'ext': 'mp4', + 'title': 'De Belgian Cats zijn klaar voor het EK', + 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', + 'duration': 115.17, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, - 'params': {'skip_download': 'm3u8'}, - }] + ] _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' _REST_API_BASE_TOKEN = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' @@ -141,16 +179,22 @@ class VRTIE(VRTLoginIE): attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '') asset_id = attrs.get('data-video-id') or attrs['data-videoid'] - publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid') + publication_id = traverse_obj( + attrs, 'data-publication-id', 'data-publicationid' + ) if publication_id: asset_id = f'{publication_id}${asset_id}' - client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site] + client = ( + traverse_obj(attrs, 'data-client-code', 'data-client') + or self._CLIENT_MAP[site] + ) data = self._call_api(asset_id, client) formats, subtitles = self._extract_formats_and_subtitles(data, asset_id) description = self._html_search_meta( - ['og:description', 'twitter:description', 'description'], webpage) + ['og:description', 'twitter:description', 'description'], webpage + ) if description == '…': description = None @@ -162,18 +206,26 @@ class VRTIE(VRTLoginIE): 'thumbnail': url_or_none(attrs.get('data-posterimage')), 'duration': float_or_none(attrs.get('data-duration'), 1000), '_old_archive_ids': [make_archive_id('Canvas', asset_id)], - **traverse_obj(data, { - 'title': ('title', {str}), - 'description': ('shortDescription', {str}), - 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), - 'thumbnail': ('posterImageUrl', {url_or_none}), - }), + **traverse_obj( + data, + { + 'title': ('title', {str}), + 'description': ('shortDescription', {str}), + 'duration': ( + 'duration', + {functools.partial(float_or_none, scale=1000)}, + ), + 'thumbnail': ('posterImageUrl', {url_or_none}), + }, + ), } class VrtNUIE(VRTLoginIE): IE_DESC = 'VRT MAX' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' + _VALID_URL = ( + r'https?://(?:www\.)?vrt\.be/(vrtmax|vrtnu)/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' + ) _TESTS = [ { 'url': 'https://www.vrt.be/vrtmax/a-z/pano/trailer/pano-trailer-najaar-2023/', @@ -218,7 +270,7 @@ class VrtNUIE(VRTLoginIE): 'thumbnail': 'https://images.vrt.be/orig/2023/11/07/37d244f0-7d8a-11ee-91d7-02b7b76bf47f.jpg', 'ext': 'mp4', }, - } + }, ] _NETRC_MACHINE = 'vrtnu' @@ -229,13 +281,31 @@ class VrtNUIE(VRTLoginIE): display_id = self._match_id(url) parsed_url = urllib.parse.urlparse(url) - self._request_webpage('https://www.vrt.be/vrtnu/sso/login', None, note='Getting tokens', errnote='Failed to get tokens') + self._request_webpage( + 'https://www.vrt.be/vrtnu/sso/login', + None, + note='Getting tokens', + errnote='Failed to get tokens', + ) metadata = self._download_json( 'https://www.vrt.be/vrtnu-api/graphql/v1', - display_id, 'Downloading asset JSON', 'Unable to download asset JSON', - headers={'Content-Type': 'application/json', 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}'}, - data=json.dumps({'operationName': 'VideoPage', 'query': self._VIDEOPAGE_QUERY, 'variables': {'pageId': f'{parsed_url.path.rstrip("/")}.model.json'}}).encode() + display_id, + 'Downloading asset JSON', + 'Unable to download asset JSON', + headers={ + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self._get_cookies("https://www.vrt.be").get("vrtnu-site_profile_at").value}', + }, + data=json.dumps( + { + 'operationName': 'VideoPage', + 'query': self._VIDEOPAGE_QUERY, + 'variables': { + 'pageId': f'{parsed_url.path.rstrip("/")}.model.json' + }, + } + ).encode(), )['data']['page'] video_id = metadata['episode']['watchAction']['streamId'] @@ -245,28 +315,41 @@ class VrtNUIE(VRTLoginIE): ld_json = {} streaming_info = self._call_api(video_id, client='vrtnu-web@PROD') - formats, subtitles = self._extract_formats_and_subtitles(streaming_info, video_id) + formats, subtitles = self._extract_formats_and_subtitles( + streaming_info, video_id + ) return { - **traverse_obj(metadata, { - 'title': ('seo', 'title', {str_or_none}), - 'season_number': ('episode', 'onTimeRaw', {lambda x: x[:4]}, {int_or_none}), - 'description': ('seo', 'description', {str_or_none}), - 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), - 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), - 'release_date': ('episode', 'onTimeRaw', {unified_strdate}), - 'upload_date': ('episode', 'onTimeRaw', {unified_strdate}), - 'series': ('episode', 'program', 'title'), - 'episode': ('episode', 'episodeNumberRaw', {str_or_none}), - 'episode_number': ('episode', 'episodeNumberRaw', {int_or_none}), - 'age_limit': ('episode', 'ageRaw', {parse_age_limit}), - 'display_id': ('episode', 'name', {parse_age_limit}), - }), - **traverse_obj(ld_json, { - 'season': ('partOfSeason', 'name'), - 'season_id': ('partOfSeason', '@id'), - 'episode_id': ('@id', {str_or_none}), - }), + **traverse_obj( + metadata, + { + 'title': ('seo', 'title', {str_or_none}), + 'season_number': ( + 'episode', + 'onTimeRaw', + {lambda x: x[:4]}, + {int_or_none}, + ), + 'description': ('seo', 'description', {str_or_none}), + 'timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), + 'release_timestamp': ('episode', 'onTimeRaw', {parse_iso8601}), + 'release_date': ('episode', 'onTimeRaw', {unified_strdate}), + 'upload_date': ('episode', 'onTimeRaw', {unified_strdate}), + 'series': ('episode', 'program', 'title'), + 'episode': ('episode', 'episodeNumberRaw', {str_or_none}), + 'episode_number': ('episode', 'episodeNumberRaw', {int_or_none}), + 'age_limit': ('episode', 'ageRaw', {parse_age_limit}), + 'display_id': ('episode', 'name', {parse_age_limit}), + }, + ), + **traverse_obj( + ld_json, + { + 'season': ('partOfSeason', 'name'), + 'season_id': ('partOfSeason', '@id'), + 'episode_id': ('@id', {str_or_none}), + }, + ), 'id': video_id, 'channel': 'VRT', 'formats': formats, @@ -279,26 +362,30 @@ class VrtNUIE(VRTLoginIE): class KetnetIE(VRTBaseIE): _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', - 'info_dict': { - 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', - 'ext': 'mp4', - 'title': 'Meisjes', - 'episode': 'Reeks 6: Week 5', - 'season': 'Reeks 6', - 'series': 'Meisjes', - 'timestamp': 1685251800, - 'upload_date': '20230528', - }, - 'params': {'skip_download': 'm3u8'}, - }] + _TESTS = [ + { + 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', + 'info_dict': { + 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ext': 'mp4', + 'title': 'Meisjes', + 'episode': 'Reeks 6: Week 5', + 'season': 'Reeks 6', + 'series': 'Meisjes', + 'timestamp': 1685251800, + 'upload_date': '20230528', + }, + 'params': {'skip_download': 'm3u8'}, + } + ] def _real_extract(self, url): display_id = self._match_id(url) video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'https://senior-bff.ketnet.be/graphql', + display_id, + query={ 'query': '''{ video(id: "content/ketnet/nl/%s.model.json") { description @@ -311,8 +398,10 @@ class KetnetIE(VRTBaseIE): subtitleVideodetail titleVideodetail } -}''' % display_id, - })['data']['video'] +}''' + % display_id, + }, + )['data']['video'] video_id = urllib.parse.unquote(video['mediaReference']) data = self._call_api(video_id, 'ketnet@PROD', version='v1') @@ -323,39 +412,45 @@ class KetnetIE(VRTBaseIE): 'formats': formats, 'subtitles': subtitles, '_old_archive_ids': [make_archive_id('Canvas', video_id)], - **traverse_obj(video, { - 'title': ('titleVideodetail', {str}), - 'description': ('description', {str}), - 'thumbnail': ('thumbnail', {url_or_none}), - 'timestamp': ('publicationDate', {parse_iso8601}), - 'series': ('programTitle', {str}), - 'season': ('seasonTitle', {str}), - 'episode': ('subtitleVideodetail', {str}), - 'episode_number': ('episodeNr', {int_or_none}), - }), + **traverse_obj( + video, + { + 'title': ('titleVideodetail', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'timestamp': ('publicationDate', {parse_iso8601}), + 'series': ('programTitle', {str}), + 'season': ('seasonTitle', {str}), + 'episode': ('subtitleVideodetail', {str}), + 'episode_number': ('episodeNr', {int_or_none}), + }, + ), } class DagelijkseKostIE(VRTBaseIE): IE_DESC = 'dagelijksekost.een.be' _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'display_id': 'hachis-parmentier-met-witloof', - }, - 'params': {'skip_download': 'm3u8'}, - }] + _TESTS = [ + { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'display_id': 'hachis-parmentier-met-witloof', + }, + 'params': {'skip_download': 'm3u8'}, + } + ] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._html_search_regex( - r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', group='id') + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', group='id' + ) data = self._call_api(video_id, 'dako@prod', version='v1') formats, subtitles = self._extract_formats_and_subtitles(data, video_id) @@ -365,10 +460,13 @@ class DagelijkseKostIE(VRTBaseIE): 'formats': formats, 'subtitles': subtitles, 'display_id': display_id, - 'title': strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage) or self._html_search_meta('twitter:title', webpage)), - 'description': clean_html(get_element_by_class( - 'dish-description', webpage)) or self._html_search_meta( - ['description', 'twitter:description', 'og:description'], webpage), + 'title': strip_or_none( + get_element_by_class('dish-metadata__title', webpage) + or self._html_search_meta('twitter:title', webpage) + ), + 'description': clean_html(get_element_by_class('dish-description', webpage)) + or self._html_search_meta( + ['description', 'twitter:description', 'og:description'], webpage + ), '_old_archive_ids': [make_archive_id('Canvas', video_id)], }