X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/bb36a55c415bc3538c841080f1e2366fb361c4a1..9f14daf22b4080ae1531a772ee7574959af4e2fa:/yt_dlp/extractor/nbc.py diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 5ebb1c869..1ea6355b5 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -9,18 +7,24 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_duration, RegexNotFoundError, smuggle_url, + str_or_none, + traverse_obj, try_get, + unified_strdate, unified_timestamp, update_url_query, + url_basename, + variadic, ) -class NBCIE(ThePlatformIE): +class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?Pn?\d+))' _TESTS = [ @@ -86,7 +90,7 @@ class NBCIE(ThePlatformIE): ] def _real_extract(self, url): - permalink, video_id = re.match(self._VALID_URL, url).groups() + permalink, video_id = self._match_valid_url(url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) video_data = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ @@ -186,6 +190,7 @@ def _real_extract(self, url): class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P%s[^\"]+)' % _VALID_URL_BASE] _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -197,27 +202,22 @@ class NBCSportsVPlayerIE(InfoExtractor): 'timestamp': 1426270238, 'upload_date': '20150313', 'uploader': 'NBCU-SPORTS', + 'duration': 72.818, + 'chapters': [], + 'thumbnail': r're:^https?://.*\.jpg$' } }, { - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2', 'only_matching': True, }, { 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - iframe_m = re.search( - r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) - if iframe_m: - return iframe_m.group('url') - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage).replace( - 'vplayer.nbcsports.com', 'player.theplatform.com') + theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url') return self.url_result(theplatform_url, 'ThePlatform') @@ -235,6 +235,9 @@ class NBCSportsIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, + 'chapters': [], + 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg', + 'duration': 528.395, } }, { # data-mpx-src @@ -302,18 +305,18 @@ def _real_extract(self, url): 'resourceId': base64.b64encode(resource.encode()).decode(), }).encode())['tokenizedUrl'] formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': live_source.get('description'), 'formats': formats, 'is_live': is_live, } -class NBCNewsIE(ThePlatformIE): +class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' + _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1'] _TESTS = [ { @@ -403,9 +406,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r']+id="__NEXT_DATA__"[^>]*>({.+?})', - webpage, 'bootstrap json'), video_id)['props']['initialState'] + data = self._search_nextjs_data(webpage, video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] @@ -435,7 +436,6 @@ def _real_extract(self, url): 'tbr': tbr, 'ext': 'mp4', }) - self._sort_formats(formats) subtitles = {} closed_captioning = video_data.get('closedCaptioning') @@ -545,8 +545,6 @@ def _real_extract(self, url): title = event_config['eventTitle'] is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus')) - if is_live: - title = self._live_title(title) source_url = self._download_json( f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging', @@ -580,8 +578,7 @@ def _real_extract(self, url): for f in formats: # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to # download with ffmpeg without this option - f['_ffmpeg_args'] = ['-seekable', '0', '-http_seekable', '0', '-icy', '0'] - self._sort_formats(formats) + f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']} return { 'id': pid, @@ -590,3 +587,168 @@ def _real_extract(self, url): 'formats': formats, 'is_live': is_live, } + + +class NBCStationsIE(InfoExtractor): + _DOMAIN_RE = '|'.join(map(re.escape, ( + 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', + 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', + 'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?P{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P[^/?#]+)/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', + 'md5': '462041d91bd762ef5a38b7d85d6dc18f', + 'info_dict': { + 'id': '2968618', + 'ext': 'mp4', + 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', + 'description': None, + 'timestamp': 1661135892, + 'upload_date': '20220821', + 'uploader': 'NBC 4', + 'uploader_id': 'KNBC', + 'channel': 'nbclosangeles', + }, + }, { + 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', + 'md5': '0917dcf7885be1023a9220630d415f67', + 'info_dict': { + 'id': '2247002', + 'ext': 'mp4', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'timestamp': 1660886507, + 'upload_date': '20220819', + 'uploader': 'Telemundo Arizona', + 'uploader_id': 'KTAZ', + 'channel': 'telemundoarizona', + }, + }] + + _RESOLUTIONS = { + '1080': '1920', + '720': '1280', + '540': '960', + '360': '640', + '234': '416', + } + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, video_id) + + nbc_data = self._search_json( + r'