From a03cd32b71d5925d06e2ac3b36f266ccc812c810 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 3 Apr 2023 00:01:50 +0200 Subject: [PATCH 1/5] [ceskatelevize] update to March 2023 changes (#6539) Note: we could even skip downloading the player.ceskatelevize.cz page completely as we do not actually need it to get the information we used to need before the recent website changes. However, we would not catch the errors that are handled here and the resulting output could be quite confusing if one of them does happen. --- yt_dlp/extractor/ceskatelevize.py | 34 ++++++------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 156b6a324..9a0260f6d 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -119,12 +119,10 @@ class CeskaTelevizeIE(InfoExtractor): type_ = 'bonus' if not idec: raise ExtractorError('Failed to find IDEC id') - iframe_hash = self._download_webpage( - 'https://www.ceskatelevize.cz/v-api/iframe-hash/', - playlist_id, note='Getting IFRAME hash') - query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } + sidp = playlist_id.rsplit('-')[0] + query = {'origin': 'iVysilani', 'autoStart': 'true', 'sidp': sidp, type_: idec, } webpage = self._download_webpage( - 'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', + 'https://player.ceskatelevize.cz/', playlist_id, note='Downloading player', query=query) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
@@ -133,29 +131,9 @@ class CeskaTelevizeIE(InfoExtractor): if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) - type_ = None - episode_id = None - - playlist = self._parse_json( - self._search_regex( - r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', - default='{}'), playlist_id) - if playlist: - type_ = playlist.get('type') - episode_id = playlist.get('id') - - if not type_: - type_ = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', - webpage, 'type') - if not episode_id: - episode_id = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', - webpage, 'episode_id') - data = { - 'playlist[0][type]': type_, - 'playlist[0][id]': episode_id, + 'playlist[0][type]': 'episode', + 'playlist[0][id]': idec, 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -231,7 +209,7 @@ class CeskaTelevizeIE(InfoExtractor): if item.get('type') == 'VOD': subs = item.get('subtitles') if subs: - subtitles = self.extract_subtitles(episode_id, subs) + subtitles = self.extract_subtitles(idec, subs) if playlist_len == 1: final_title = playlist_title or title From 59cdcf779521286b66a45a97b1f71e736ad786ac Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 5 Apr 2023 16:04:03 +0530 Subject: [PATCH 2/5] Update yt_dlp/extractor/ceskatelevize.py --- yt_dlp/extractor/ceskatelevize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 9a0260f6d..e6a2e5129 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -120,7 +120,7 @@ class CeskaTelevizeIE(InfoExtractor): if not idec: raise ExtractorError('Failed to find IDEC id') sidp = playlist_id.rsplit('-')[0] - query = {'origin': 'iVysilani', 'autoStart': 'true', 'sidp': sidp, type_: idec, } + query = {'origin': 
'iVysilani', 'autoStart': 'true', 'sidp': sidp, type_: idec} webpage = self._download_webpage( 'https://player.ceskatelevize.cz/', playlist_id, note='Downloading player', query=query) From e1b623cea19c60bba72854bfb46de2432d03225f Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 7 Apr 2023 01:19:15 +0200 Subject: [PATCH 3/5] [ceskatelevize] fix live broadcast After recent site changes, live streaming is handled in three different ways. Update the code to deal with all of them. Also update the test URLs. --- yt_dlp/extractor/ceskatelevize.py | 105 +++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index e6a2e5129..731d01f52 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -1,4 +1,5 @@ import re +import json from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse @@ -8,6 +9,7 @@ from ..utils import ( float_or_none, str_or_none, traverse_obj, + unescapeHTML, urlencode_postdata, ) @@ -19,9 +21,9 @@ USER_AGENTS = { class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'url': 'https://www.ceskatelevize.cz/porady/10441294653-hyde-park-civilizace/bonus/20641/', 'info_dict': { - 'id': '61924494877028507', + 'id': '20641', 'ext': 'mp4', 'title': 'Bonus 01 - En - Hyde Park Civilizace', 'description': 'English Subtittles', @@ -34,7 +36,7 @@ class CeskaTelevizeIE(InfoExtractor): }, }, { # live stream - 'url': 'http://www.ceskatelevize.cz/zive/ct1/', + 'url': 'https://www.ceskatelevize.cz/zive/ct1/', 'info_dict': { 'id': '102', 'ext': 'mp4', @@ -48,7 +50,7 @@ class CeskaTelevizeIE(InfoExtractor): }, }, { # another - 'url': 
'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'url': 'https://www.ceskatelevize.cz/zive/sport/', 'only_matching': True, 'info_dict': { 'id': '402', @@ -106,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_description: playlist_description = playlist_description.replace('\xa0', ' ') - type_ = 'IDEC' + type_ = 'episode' if re.search(r'(^/porady|/zive)/', parsed_url.path): next_data = self._search_nextjs_data(webpage, playlist_id) if '/zive/' in parsed_url.path: @@ -119,11 +121,65 @@ class CeskaTelevizeIE(InfoExtractor): type_ = 'bonus' if not idec: raise ExtractorError('Failed to find IDEC id') - sidp = playlist_id.rsplit('-')[0] + sidp = self._search_regex(r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/([0-9]+)-', url, playlist_id, default=playlist_id) + sidp = sidp.rsplit('-')[0] query = {'origin': 'iVysilani', 'autoStart': 'true', 'sidp': sidp, type_: idec} webpage = self._download_webpage( 'https://player.ceskatelevize.cz/', playlist_id, note='Downloading player', query=query) + playlistpage_url = 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/' + data = { + 'playlist[0][type]': type_, + 'playlist[0][id]': idec, + 'requestUrl': parsed_url.path, + 'requestSource': 'iVysilani', + } + elif parsed_url.path == '/' and parsed_url.fragment == 'live': + if self._search_regex(r'(?s)]+id=[\'"]live[\'"][^>]+data-ctcomp-data=\'([^\']+)\'[^>]*>', webpage, 'live video player', default=None): + # CT4 + ctcomp_data = self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]live[\'"][^>]+data-ctcomp-data=\'([^\']+)\'[^>]*>', + webpage, 'ctcomp data', fatal=True), + playlist_id, transform_source=unescapeHTML) + current_item = traverse_obj(ctcomp_data, ('items', ctcomp_data.get('currentItem'), 'items', 0, 'video', 'data', 'source', 'playlist', 0)) + playlistpage_url = 'https://playlist.ceskatelevize.cz/' + data = { + 'contentType': 'live', + 'items': [{ + 'id': current_item.get('id'), + 'key': current_item.get('key'), + 
'assetId': current_item.get('assetId'), + 'playerType': 'dash', + 'date': current_item.get('date'), + 'requestSource': current_item.get('requestSource'), + 'drm': current_item.get('drm'), + 'quality': current_item.get('quality'), + }] + } + data = {'data': json.dumps(data).encode('utf-8')} + else: + # CT24 + lvp_url = self._search_regex( + r'(?s)]+id=[\'"]live-video-player[\'"][^>]+data-url=[\'"]([^\'"]+)[\'"][^>]*>', + webpage, 'live video player', fatal=True) + lvp_hash = self._search_regex( + r'(?s)media_ivysilani: *{ *hash *: *[\'"]([0-9a-f]+)[\'"] *}', + webpage, 'live video hash', fatal=True) + lvp_url += '&hash=' + lvp_hash + webpage = self._download_webpage(unescapeHTML(lvp_url), playlist_id) + playlistpage = self._search_regex( + r'(?s)getPlaylistUrl\((\[[^\]]+\])[,\)]', + webpage, 'playlist params', fatal=True) + playlistpage_params = self._parse_json(playlistpage, playlist_id)[0] + playlistpage_url = 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/' + idec = playlistpage_params.get('id') + data = { + 'playlist[0][type]': playlistpage_params.get('type'), + 'playlist[0][id]': idec, + 'requestUrl': '/ivysilani/embed/iFramePlayer.php', + 'requestSource': 'iVysilani', + } NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s

' % NOT_AVAILABLE_STRING in webpage: @@ -131,20 +187,10 @@ class CeskaTelevizeIE(InfoExtractor): if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) - data = { - 'playlist[0][type]': 'episode', - 'playlist[0][id]': idec, - 'requestUrl': parsed_url.path, - 'requestSource': 'iVysilani', - } - entries = [] for user_agent in (None, USER_AGENTS['Safari']): - req = Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', - data=urlencode_postdata(data)) - + req = Request(playlistpage_url, data=urlencode_postdata(data)) req.headers['Content-type'] = 'application/x-www-form-urlencoded' req.headers['x-addr'] = '127.0.0.1' req.headers['X-Requested-With'] = 'XMLHttpRequest' @@ -157,18 +203,19 @@ class CeskaTelevizeIE(InfoExtractor): if not playlistpage: continue - playlist_url = playlistpage['url'] - if playlist_url == 'error_region': - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - - req = Request(compat_urllib_parse_unquote(playlist_url)) - req.headers['Referer'] = url - - playlist = self._download_json(req, playlist_id, fatal=False) - if not playlist: - continue + playlist_url = playlistpage.get('url') + if playlist_url: + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + req = Request(compat_urllib_parse_unquote(playlist_url)) + req.headers['Referer'] = url + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + playlist = playlist.get('playlist') + else: + playlist = traverse_obj(playlistpage, ('RESULT', 'playlist')) - playlist = playlist.get('playlist') if not isinstance(playlist, list): continue @@ -200,7 +247,7 @@ class CeskaTelevizeIE(InfoExtractor): continue item_id = str_or_none(item.get('id') or item['assetId']) - title = item['title'] + title = item.get('title') or 'live' duration = 
float_or_none(item.get('duration')) thumbnail = item.get('previewImageUrl') From 4f31126f07cb4b2770945afdda168838354c0d4c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 10 Apr 2023 16:18:16 +0200 Subject: [PATCH 4/5] [ceskatelevize] more robust sidp detection For older-style livestreams (all except CT4 Sport and CT24), the sidp value cannot be determined from the URL; the value used by browsers can be found as showId inside the next_data JSON. --- yt_dlp/extractor/ceskatelevize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 731d01f52..9cd7eecf4 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -113,15 +113,16 @@ class CeskaTelevizeIE(InfoExtractor): next_data = self._search_nextjs_data(webpage, playlist_id) if '/zive/' in parsed_url.path: idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) + sidp = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'showId'), get_all=False) else: idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) if not idec: idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False) if idec: type_ = 'bonus' + sidp = self._search_regex(r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/([0-9]+)-', url, playlist_id, default=playlist_id) if not idec: raise ExtractorError('Failed to find IDEC id') - sidp = self._search_regex(r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/([0-9]+)-', url, playlist_id, default=playlist_id) sidp = sidp.rsplit('-')[0] query = {'origin': 'iVysilani', 'autoStart': 'true', 'sidp': sidp, type_: idec} webpage = self._download_webpage( From 6f1db75869dab4651ac8db15798b9ca646f39f4d Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 10 Apr 2023 16:21:05 
+0200 Subject: [PATCH 5/5] [ceskatelevize] update selftests Most selftest metadata is no longer correct. - use 'live_status' to identify live broadcast; this can no longer be recognized from item['type'] (which is always 'VOD') so move the detection to the code path where we can actually find out - update outdated test metadata - drop georestricted test, the URL is no longer valid and I could not find the correct one - disable live broadcast tests for now; CT1 does not have a stable id any more and the selftest framework requires duration which depends on current program --- yt_dlp/extractor/ceskatelevize.py | 48 ++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 9cd7eecf4..d555eabab 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -23,12 +23,13 @@ class CeskaTelevizeIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.ceskatelevize.cz/porady/10441294653-hyde-park-civilizace/bonus/20641/', 'info_dict': { - 'id': '20641', + 'id': '61924494877028507', 'ext': 'mp4', 'title': 'Bonus 01 - En - Hyde Park Civilizace', 'description': 'English Subtittles', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 81.3, + 'live_status': 'not_live', }, 'params': { # m3u8 download @@ -37,12 +38,15 @@ class CeskaTelevizeIE(InfoExtractor): }, { # live stream 'url': 'https://www.ceskatelevize.cz/zive/ct1/', + 'only_matching': True, 'info_dict': { - 'id': '102', + 'id': '61924494878124436', 'ext': 'mp4', - 'title': r'ČT1 - živé vysílání online', + 'title': r're:^ČT1 - živé vysílání online \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'description': 'Sledujte živé vysílání kanálu ČT1 online. 
Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', - 'is_live': True, + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 5373.3, + 'live_status': 'is_live', }, 'params': { # m3u8 download @@ -53,15 +57,16 @@ class CeskaTelevizeIE(InfoExtractor): 'url': 'https://www.ceskatelevize.cz/zive/sport/', 'only_matching': True, 'info_dict': { - 'id': '402', + 'id': '422', 'ext': 'mp4', 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', - 'is_live': True, + 'thumbnail': r're:^https?://.*\.jpg', + 'live_status': 'is_live', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, - # 'skip': 'Georestricted to Czech Republic', - }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', - 'only_matching': True, }, { # video with 18+ caution trailer 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', @@ -76,6 +81,7 @@ class CeskaTelevizeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bogotart - Queer (Varování 18+)', 'duration': 11.9, + 'live_status': 'not_live', }, }, { 'info_dict': { @@ -84,6 +90,7 @@ class CeskaTelevizeIE(InfoExtractor): 'title': 'Bogotart - Queer (Queer)', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1558.3, + 'live_status': 'not_live', }, }], 'params': { @@ -93,7 +100,19 @@ class CeskaTelevizeIE(InfoExtractor): }, { # iframe embed 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, + 'info_dict': { + 'id': '61924494877628660', + 'ext': 'mp4', + 'title': 'Epizoda 1/13 - Neviditelní', + 'description': 'Vypadají jako my, mluví jako my, ale mají něco navíc – gen, který jim umožňuje dýchat vodu. 
Aniž to tušíme, žijí mezi námi.', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3576.8, + 'live_status': 'not_live', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -109,11 +128,13 @@ class CeskaTelevizeIE(InfoExtractor): playlist_description = playlist_description.replace('\xa0', ' ') type_ = 'episode' + is_live = False if re.search(r'(^/porady|/zive)/', parsed_url.path): next_data = self._search_nextjs_data(webpage, playlist_id) if '/zive/' in parsed_url.path: idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) sidp = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'showId'), get_all=False) + is_live = True else: idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) if not idec: @@ -138,6 +159,7 @@ class CeskaTelevizeIE(InfoExtractor): elif parsed_url.path == '/' and parsed_url.fragment == 'live': if self._search_regex(r'(?s)]+id=[\'"]live[\'"][^>]+data-ctcomp-data=\'([^\']+)\'[^>]*>', webpage, 'live video player', default=None): # CT4 + is_live = True ctcomp_data = self._parse_json( self._search_regex( r'(?s)]+id=[\'"]live[\'"][^>]+data-ctcomp-data=\'([^\']+)\'[^>]*>', @@ -161,6 +183,7 @@ class CeskaTelevizeIE(InfoExtractor): data = {'data': json.dumps(data).encode('utf-8')} else: # CT24 + is_live = True lvp_url = self._search_regex( r'(?s)]+id=[\'"]live-video-player[\'"][^>]+data-url=[\'"]([^\'"]+)[\'"][^>]*>', webpage, 'live video player', fatal=True) @@ -223,7 +246,6 @@ class CeskaTelevizeIE(InfoExtractor): playlist_len = len(playlist) for num, item in enumerate(playlist): - is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): if 'playerType=flash' in stream_url: @@ -272,7 +294,7 @@ class CeskaTelevizeIE(InfoExtractor): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - 
'is_live': is_live, + 'live_status': 'is_live' if is_live else 'not_live', }) if len(entries) == 1: