From f80ba18ee9b45cab392ca753a71b5bf3bdb4bd40 Mon Sep 17 00:00:00 2001
From: "Renan D."
Date: Fri, 3 May 2024 19:27:49 -0300
Subject: [PATCH] [threads] Add extractor

---
 supportedsites.md               |   2 +
 yt_dlp/extractor/_extractors.py |   4 +
 yt_dlp/extractor/threads.py     | 154 ++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 yt_dlp/extractor/threads.py

diff --git a/supportedsites.md b/supportedsites.md
index ba77c0feb..8de524a1d 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -1449,6 +1449,8 @@
  - **ThisVid**
  - **ThisVidMember**
  - **ThisVidPlaylist**
+ - **Threads**
+ - **ThreadsIOS**: Threads' iOS `barcelona://` URL
  - **ThreeSpeak**
  - **ThreeSpeakUser**
  - **TikTok**
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 42034275b..c7e1174c3 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1988,6 +1988,10 @@ from .thisvid import (
     ThisVidMemberIE,
     ThisVidPlaylistIE,
 )
+from .threads import (
+    ThreadsIE,
+    ThreadsIOSIE,
+)
 from .threespeak import (
     ThreeSpeakIE,
     ThreeSpeakUserIE,
diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py
new file mode 100644
index 000000000..890fd8b97
--- /dev/null
+++ b/yt_dlp/extractor/threads.py
@@ -0,0 +1,154 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    remove_end,
+    strftime_or_none,
+    strip_or_none,
+    traverse_obj,
+)
+
+
+class ThreadsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?threads\.net/(?P<uploader>[^/]+)/post/(?P<id>[^/?#&]+)/?(?P<embed>embed.*?)?'
+
+    _TESTS = [{
+        'url': 'https://www.threads.net/@tntsportsbr/post/C6cqebdCfBi',
+        'info_dict': {
+            'id': 'C6cqebdCfBi',
+            'ext': 'mp4',
+            'title': 'md5:062673d04195aa2d99b8d7a11798cb9d',
+            'description': 'md5:fe0c73f9a892fb92efcc67cc075561b0',
+            'uploader': 'TNT Sports Brasil',
+            'uploader_id': 'tntsportsbr',
+            'uploader_url': 'https://www.threads.net/@tntsportsbr',
+            'channel': 'tntsportsbr',
+            'channel_url': 'https://www.threads.net/@tntsportsbr',
+            'timestamp': 1714613811,
+            'upload_date': '20240502',
+            'like_count': int,
+            'channel_is_verified': bool,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }, {
+        'url': 'https://www.threads.net/@felipebecari/post/C6cM_yNPHCF',
+        'info_dict': {
+            'id': 'C6cM_yNPHCF',
+            'ext': 'mp4',
+            'title': '@felipebecari • Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️',
+            'description': 'Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️',
+            'uploader': 'Felipe Becari',
+            'uploader_id': 'felipebecari',
+            'uploader_url': 'https://www.threads.net/@felipebecari',
+            'channel': 'felipebecari',
+            'channel_url': 'https://www.threads.net/@felipebecari',
+            'timestamp': 1714598318,
+            'upload_date': '20240501',
+            'like_count': int,
+            'channel_is_verified': bool,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Collect the videos, thumbnails and metadata of the post matching the URL's id."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The post data is embedded as JSON in a <script> tag; grab the one
+        # that mentions this post's shortcode
+        json_data = self._search_regex(
+            r'<script[^>]+>(.*"code":"%s".*)</script>' % re.escape(video_id),
+            webpage, 'main json')
+        result = self._search_json(
+            r'"result":', json_data, 'result data', video_id)
+
+        # Initialised up front so that a page without a matching post yields
+        # empty lists instead of an UnboundLocalError
+        formats, thumbnails, metadata = [], [], {}
+
+        posts = traverse_obj(result, (
+            'data', 'data', 'edges', ..., 'node', 'thread_items', ..., 'post'))
+        for post in posts:
+            if not isinstance(post, dict) or post.get('code') != video_id:
+                continue
+
+            # Posts with multiple videos list them under 'carousel_media'
+            for media in post.get('carousel_media') or [post]:
+                for video in media.get('video_versions') or []:
+                    if not video.get('url'):
+                        continue
+                    formats.append({
+                        'format_id': '%s-%s' % (media.get('pk'), video.get('type')),  # id-type
+                        'url': video['url'],
+                        'width': media.get('original_width'),
+                        'height': media.get('original_height'),
+                    })
+
+            for thumb in traverse_obj(post, ('image_versions2', 'candidates', ...)):
+                if not thumb.get('url'):
+                    continue
+                thumbnails.append({
+                    'url': thumb['url'],
+                    'width': thumb.get('width'),
+                    'height': thumb.get('height'),
+                })
+
+            # setdefault keeps the values of the first matching post
+            username = traverse_obj(post, ('user', 'username'))
+            metadata.setdefault('uploader_id', username)
+            metadata.setdefault('channel_is_verified', traverse_obj(post, ('user', 'is_verified')))
+            # Only build the profile URL when the username is known (avoids "@None")
+            metadata.setdefault(
+                'uploader_url', f'https://www.threads.net/@{username}' if username else None)
+            metadata.setdefault('timestamp', post.get('taken_at'))
+            metadata.setdefault('like_count', post.get('like_count'))
+
+        return {
+            **metadata,
+            'id': video_id,
+            'title': strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads')),
+            'description': self._og_search_description(webpage),
+            'channel': metadata.get('uploader_id'),
+            'channel_url': metadata.get('uploader_url'),
+            # The uploader's display name is the part of og:title before the
+            # first " ("; optional, so a missing/odd title does not abort
+            'uploader': self._search_regex(
+                r'(.*?) \(', self._og_search_title(webpage, default=''),
+                'uploader', default=None),
+            'upload_date': strftime_or_none(metadata.get('timestamp')),
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+
+class ThreadsIOSIE(InfoExtractor):
+    IE_DESC = 'Threads\' iOS `barcelona://` URL'
+    _VALID_URL = r'barcelona://media\?shortcode=(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'barcelona://media?shortcode=C6fDehepo5D',
+        'info_dict': {
+            'id': 'C6fDehepo5D',
+            'ext': 'mp4',
+            'title': 'md5:dc92f960981b8b3a33eba9681e9fdfc6',
+            'description': 'md5:0c36a7e67e1517459bc0334dba932164',
+            'uploader': 'Sa\u0303o Paulo Futebol Clube',
+            'uploader_id': 'saopaulofc',
+            'uploader_url': 'https://www.threads.net/@saopaulofc',
+            'channel': 'saopaulofc',
+            'channel_url': 'https://www.threads.net/@saopaulofc',
+            'timestamp': 1714694014,
+            'upload_date': '20240502',
+            'like_count': int,
+            'channel_is_verified': bool,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+        'add_ie': ['Threads'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Threads doesn't care about the user url, it redirects to the right one
+        # So we use ** instead so that we don't need to find it
+        return self.url_result(f'https://www.threads.net/**/post/{video_id}', ThreadsIE, video_id)