X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/9160a0c6a251312917ea34ff60fdc9a22e364f11..e63faa101cf7b9bf9f899cabb74ce03c7f893572:/yt_dlp/extractor/nbc.py diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 2f25b9e7b..3de8c1508 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -9,13 +7,20 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_duration, + RegexNotFoundError, smuggle_url, + str_or_none, + traverse_obj, try_get, + unified_strdate, unified_timestamp, update_url_query, + url_basename, + variadic, ) @@ -85,7 +90,7 @@ class NBCIE(ThePlatformIE): ] def _real_extract(self, url): - permalink, video_id = re.match(self._VALID_URL, url).groups() + permalink, video_id = self._match_valid_url(url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) video_data = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ @@ -185,6 +190,7 @@ def _real_extract(self, url): class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P%s[^\"]+)' % _VALID_URL_BASE] _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -196,27 +202,22 @@ class NBCSportsVPlayerIE(InfoExtractor): 'timestamp': 1426270238, 'upload_date': '20150313', 'uploader': 'NBCU-SPORTS', + 'duration': 72.818, + 'chapters': [], + 'thumbnail': r're:^https?://.*\.jpg$' } }, { - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2', 'only_matching': True, }, { 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - iframe_m = re.search( - r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) - if iframe_m: - return iframe_m.group('url') - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage).replace( - 'vplayer.nbcsports.com', 'player.theplatform.com') + theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url') return self.url_result(theplatform_url, 'ThePlatform') @@ -234,6 +235,9 @@ class NBCSportsIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, + 'chapters': [], + 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg', + 'duration': 528.395, } }, { # data-mpx-src @@ -304,7 +308,7 @@ def _real_extract(self, url): self._sort_formats(formats) return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': live_source.get('description'), 'formats': formats, 'is_live': is_live, @@ -313,6 +317,7 @@ def _real_extract(self, url): class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' + _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1'] _TESTS = [ { @@ -402,9 +407,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r']+id="__NEXT_DATA__"[^>]*>({.+?})', - webpage, 'bootstrap json'), video_id)['props']['initialState'] + data = self._search_nextjs_data(webpage, video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] @@ -460,7 +463,7 @@ def _real_extract(self, url): class NBCOlympicsIE(InfoExtractor): IE_NAME = 'nbcolympics' - _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P[a-z-]+)' + _VALID_URL = r'https?://www\.nbcolympics\.com/videos?/(?P[0-9a-z-]+)' _TEST = { # Geo-restricted to US @@ -483,13 +486,18 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) + try: + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) - iframe_url = drupal_settings['vod']['iframe_url'] - theplatform_url = iframe_url.replace( - 'vplayer.nbcolympics.com', 'player.theplatform.com') + iframe_url = drupal_settings['vod']['iframe_url'] + theplatform_url = iframe_url.replace( + 'vplayer.nbcolympics.com', 'player.theplatform.com') + except RegexNotFoundError: + theplatform_url = self._search_regex( + r"([\"'])embedUrl\1: *([\"'])(?P.+)\2", + webpage, 'embedding URL', group="embedUrl") return { '_type': 'url_transparent', @@ -502,43 +510,77 @@ def _real_extract(self, url): class NBCOlympicsStreamIE(AdobePassIE): IE_NAME = 'nbcolympics:stream' _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P[0-9a-z-]+)' - _TEST = { - 'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8', - 'info_dict': { - 'id': '203493', - 'ext': 'mp4', - 'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # m3u8 download - 'skip_download': True, + _TESTS = [ + { + 'note': 'Tokenized m3u8 source URL', + 'url': 'https://stream.nbcolympics.com/womens-soccer-group-round-11', + 'info_dict': { + 'id': '2019740', + 'ext': 'mp4', + 'title': r"re:Women's Group Stage - Netherlands vs\. Brazil [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$", + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'note': 'Plain m3u8 source URL', + 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', + 'info_dict': { + 'id': '2021729', + 'ext': 'mp4', + 'title': r're:Event Finals: M Floor, W Vault, M Pommel, W Uneven Bars [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + 'skip_download': 'm3u8', + }, }, - } - _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json' + ] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid') - resource = self._search_regex( - r"resource\s*=\s*'(.+)';", webpage, - 'resource').replace("' + pid + '", pid) + event_config = self._download_json( - self._DATA_URL_TEMPLATE % ('event_config', pid), - pid)['eventConfig'] - title = self._live_title(event_config['eventTitle']) + f'http://stream.nbcolympics.com/data/event_config_{pid}.json', + pid, 'Downloading event config')['eventConfig'] + + title = event_config['eventTitle'] + is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus')) + source_url = self._download_json( - self._DATA_URL_TEMPLATE % ('live_sources', pid), - pid)['videoSources'][0]['sourceUrl'] - media_token = self._extract_mvpd_auth( - url, pid, event_config.get('requestorId', 'NBCOlympics'), resource) - formats = self._extract_m3u8_formats(self._download_webpage( - 'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={ - 'cdn': 'akamai', - 'mediaToken': base64.b64encode(media_token.encode()), - 'resource': base64.b64encode(resource.encode()), - 'url': source_url, - }), pid, 'mp4') + f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging', + pid, 'Downloading leap config' + )['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl'] + + if event_config.get('cdnToken'): + ap_resource = self._get_mvpd_resource( + event_config.get('resourceId', 'NBCOlympics'), + re.sub(r'[^\w\d ]+', '', event_config['eventTitle']), pid, + event_config.get('ratingId', 'NO VALUE')) + media_token = self._extract_mvpd_auth(url, pid, event_config.get('requestorId', 'NBCOlympics'), ap_resource) + + source_url = self._download_json( + 'https://tokens.playmakerservices.com/', pid, 'Retrieving tokenized URL', + data=json.dumps({ + 'application': 'NBCSports', + 'authentication-type': 'adobe-pass', + 'cdn': 'akamai', + 'pid': pid, + 'platform': 'desktop', + 'requestorId': 'NBCOlympics', + 'resourceId': base64.b64encode(ap_resource.encode()).decode(), + 'token': base64.b64encode(media_token.encode()).decode(), + 'url': source_url, + 'version': 'v1', + }).encode(), + )['akamai'][0]['tokenizedUrl'] + + formats = self._extract_m3u8_formats(source_url, pid, 'mp4', live=is_live) + for f in formats: + # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to + # download with ffmpeg without this option + f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']} self._sort_formats(formats) return { @@ -546,5 +588,171 @@ def _real_extract(self, url): 'display_id': display_id, 'title': title, 'formats': formats, - 'is_live': True, + 'is_live': is_live, + } + + +class NBCStationsIE(InfoExtractor): + _DOMAIN_RE = '|'.join(map(re.escape, ( + 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', + 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', + 'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?P{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P[^/?#]+)/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', + 'md5': '462041d91bd762ef5a38b7d85d6dc18f', + 'info_dict': { + 'id': '2968618', + 'ext': 'mp4', + 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', + 'description': None, + 'timestamp': 1661135892, + 'upload_date': '20220821', + 'uploader': 'NBC 4', + 'uploader_id': 'KNBC', + 'channel': 'nbclosangeles', + }, + }, { + 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', + 'md5': '0917dcf7885be1023a9220630d415f67', + 'info_dict': { + 'id': '2247002', + 'ext': 'mp4', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'timestamp': 1660886507, + 'upload_date': '20220819', + 'uploader': 'Telemundo Arizona', + 'uploader_id': 'KTAZ', + 'channel': 'telemundoarizona', + }, + }] + + _RESOLUTIONS = { + '1080': '1920', + '720': '1280', + '540': '960', + '360': '640', + '234': '416', + } + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, video_id) + + nbc_data = self._search_json( + r'