[extractor/rheinmaintv] Add extractor (#7311)

Authored by: barthelmannk

Co-authored-by: barthelmannk <81305638+barthelmannk@users.noreply.github.com>
pull/7390/head
bashonly 11 months ago committed by GitHub
parent 774aa09dd6
commit 98cb1eda7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1617,6 +1617,7 @@ from .rentv import (
from .restudy import RestudyIE from .restudy import RestudyIE
from .reuters import ReutersIE from .reuters import ReutersIE
from .reverbnation import ReverbNationIE from .reverbnation import ReverbNationIE
from .rheinmaintv import RheinMainTVIE
from .rice import RICEIE from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE from .rmcdecouverte import RMCDecouverteIE
from .rockstargames import RockstarGamesIE from .rockstargames import RockstarGamesIE

@ -0,0 +1,94 @@
from .common import InfoExtractor
from ..utils import extract_attributes, merge_dicts, remove_end
class RheinMainTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
_TESTS = [{
'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
'info_dict': {
'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
'ext': 'ismv', # ismv+isma will be merged into mp4
'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
'title': 'Auf dem Weg zur Deutschen Meisterschaft',
'upload_date': '20221108',
'view_count': int,
'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
'thumbnail': r're:^https://.+\.jpg',
'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
'timestamp': 1667933057,
'duration': 243.0,
},
'params': {'skip_download': 'ism'},
}, {
'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
'info_dict': {
'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
'ext': 'ismv',
'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
'timestamp': 1668526214,
'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
'view_count': int,
'thumbnail': r're:^https://.+\.jpg',
'duration': 345.0,
'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
'upload_date': '20221115',
},
'params': {'skip_download': 'ism'},
}, {
'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
'info_dict': {
'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
'ext': 'ismv',
'title': 'Casino Mainz bei den Deutschen Meisterschaften',
'view_count': int,
'timestamp': 1668527402,
'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
'upload_date': '20221115',
'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
'duration': 348.0,
'thumbnail': r're:^https://.+\.jpg',
'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
},
'params': {'skip_download': 'ism'},
}, {
'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
display_id = mobj.group('display_id')
video_id = mobj.group('video_id').replace('/', '-')
webpage = self._download_webpage(url, video_id)
source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
webpage, 'video', group=('source', 'img'))
source = extract_attributes(source)
img = extract_attributes(img)
raw_json_ld = list(self._yield_json_ld(webpage, video_id))
json_ld = self._json_ld(raw_json_ld, video_id)
json_ld.pop('url', None)
ism_manifest_url = (
source.get('src')
or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject')
)
formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)
return merge_dicts({
'id': video_id,
'display_id': display_id,
'title':
self._html_search_regex(r'<h1><span class="title">([^<]*)</span>',
webpage, 'headline', default=None)
or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
or remove_end(self._html_extract_title(webpage), ' -'),
'alt_title': img.get('alt'),
'description': json_ld.get('description') or self._og_search_description(webpage),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
}, json_ld)
Loading…
Cancel
Save