From 2c98d998181c81ee49908be03c031204fd66d03d Mon Sep 17 00:00:00 2001
From: schnusch
Date: Mon, 10 Oct 2022 22:31:01 +0200
Subject: [PATCH] [extractors/podbayfm] Add extractor (#4971)

Authored by: schnusch
---
Review notes (ignored when the patch is applied):
 * Both _VALID_URL patterns use the named group (?P<id>...). The form
   (?P[^/]*) is rejected by Python's re module, and _match_id() reads the
   group called "id".
 * podbayfm.py carries added docstrings and comments, so its hunk is 90 lines
   long and the "index" blob id of that file no longer describes its content.
 * NOTE(review): the From: header has no e-mail address - confirm before
   feeding this file to "git am".

 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/podbayfm.py    | 90 +++++++++++++++++++++++++++++++++
 yt_dlp/utils.py                 |  3 +-
 3 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 yt_dlp/extractor/podbayfm.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index e5be35716..d514f9a89 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1345,6 +1345,7 @@ from .pluralsight import (
     PluralsightIE,
     PluralsightCourseIE,
 )
+from .podbayfm import PodbayFMIE, PodbayFMChannelIE
 from .podchaser import PodchaserIE
 from .podomatic import PodomaticIE
 from .pokemon import (
diff --git a/yt_dlp/extractor/podbayfm.py b/yt_dlp/extractor/podbayfm.py
new file mode 100644
index 000000000..2a26fd2b3
--- /dev/null
+++ b/yt_dlp/extractor/podbayfm.py
@@ -0,0 +1,90 @@
+from .common import InfoExtractor
+from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
+
+
+def result_from_props(props, episode_id=None):
+    """Build the info dict of one episode from its ``props`` mapping.
+
+    ``props['mediaURL']`` is required, every other key is optional.
+    ``episode_id`` is used as the id only when ``podcast_id`` is missing or falsy.
+    """
+    return {
+        'id': props.get('podcast_id') or episode_id,
+        'title': props.get('title'),
+        'url': props['mediaURL'],
+        'ext': 'mp3',
+        # 'image' is decoded as a JWT; the thumbnail URL is read from its payload
+        'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
+        'timestamp': props.get('timestamp'),
+        'duration': int_or_none(props.get('duration')),
+    }
+
+
+class PodbayFMIE(InfoExtractor):
+    """Extract a single podbay.fm episode."""
+
+    _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$'
+    _TESTS = [{
+        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
+        'md5': '98b41285dcf7989d105a4ed0404054cf',
+        'info_dict': {
+            'id': '1647338400',
+            'title': 'Part One: Kissinger',
+            'ext': 'mp3',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1647338400,
+            'duration': 5001,
+            'upload_date': '20220315',
+        },
+    }]
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        webpage = self._download_webpage(url, episode_id)
+        # the episode's props are taken from the page's Next.js data
+        data = self._search_nextjs_data(webpage, episode_id)
+        return result_from_props(data['props']['pageProps']['episode'], episode_id)
+
+
+class PodbayFMChannelIE(InfoExtractor):
+    """Extract a podbay.fm podcast as a playlist of its episodes."""
+
+    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$'
+    _TESTS = [{
+        'url': 'https://podbay.fm/p/behind-the-bastards',
+        'info_dict': {
+            'id': 'behind-the-bastards',
+            'title': 'Behind the Bastards',
+        },
+    }]
+    _PAGE_SIZE = 10
+
+    def _fetch_page(self, channel_id, pagenum):
+        """Fetch page ``pagenum`` of the podcast API; return its 'podcast' object."""
+        return self._download_json(
+            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
+            channel_id)['podcast']
+
+    @staticmethod
+    def _results_from_page(channel_id, page):
+        """Convert the 'episodes' of one API page into entries attributed to PodbayFMIE."""
+        return [{
+            **result_from_props(e),
+            'extractor': PodbayFMIE.IE_NAME,
+            'extractor_key': PodbayFMIE.ie_key(),
+            # somehow they use timestamps as the episode identifier
+            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
+        } for e in page['episodes']]
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        # page 0 is downloaded up front because the playlist title comes from it;
+        # the lambda reuses it for pagenum 0 instead of requesting it again
+        first_page = self._fetch_page(channel_id, 0)
+        entries = OnDemandPagedList(
+            lambda pagenum: self._results_from_page(
+                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(entries, channel_id, first_page.get('title'))
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 5a88a928d..c2327ae1d 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -5499,7 +5499,8 @@ def jwt_encode_hs256(payload_data, key, headers={}):
 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
 def jwt_decode_hs256(jwt):
     header_b64, payload_b64, signature_b64 = jwt.split('.')
-    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
+    # add trailing ='s that may have been stripped, superfluous ='s are ignored
+    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
     return payload_data
 
 