X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/bfd973ece3369c593b5e82a88cc16de80088a73e..0dd53faeca2ba0ce138e4092d07b5f2dbf2422f9:/yt_dlp/extractor/nbc.py diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 910cbedf6..267fa8353 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -1,25 +1,37 @@ import base64 import json import re +import xml.etree.ElementTree from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformIE, default_ns from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( + ExtractorError, + RegexNotFoundError, + UserNotLive, + clean_html, + determine_ext, + float_or_none, int_or_none, + mimetype2ext, parse_age_limit, parse_duration, - RegexNotFoundError, + remove_end, smuggle_url, + traverse_obj, try_get, + unescapeHTML, unified_timestamp, update_url_query, + url_basename, ) -class NBCIE(ThePlatformIE): - _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?Pn?\d+))' +class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P(?:NBCE|n)?\d+))' _TESTS = [ { @@ -32,10 +44,20 @@ class NBCIE(ThePlatformIE): 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', + 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', + 'episode_number': 86, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Tonight Show: Jimmy Fallon', + 'duration': 237.0, + 'chapters': 'count:1', + 'tags': 'count:4', + 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], + 'media_type': 'Full Episode', }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { @@ -49,11 +71,7 @@ class NBCIE(ThePlatformIE): 'upload_date': '20141206', 'uploader': 'NBCU-COM', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', + 'skip': 'page not found', }, { # HLS streams requires the 'hdnea3' cookie @@ -67,10 +85,60 @@ class NBCIE(ThePlatformIE): 'upload_date': '20090315', 'uploader': 'NBCU-COM', }, + 'skip': 'page not found', + }, + { + # manifest url does not have extension + 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', + 'info_dict': { + 'id': '3646439', + 'ext': 'mp4', + 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode_number': 1, + 'season': 'Season 75', + 'season_number': 75, + 'series': 'The Golden Globe Awards', + 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', + 'uploader': 'NBCU-COM', + 'upload_date': '20180107', + 'timestamp': 1515312000, + 'duration': 570.0, + 'tags': 'count:8', + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:1', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + # new video_id format + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'info_dict': { + 'id': 'NBCE125189978', + 'ext': 'mp4', + 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', + 'uploader': 'NBCU-COM', + 'series': 'Quantum Leap', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'episode_number': 1, + 'duration': 170.171, + 'chapters': [], + 'timestamp': 1663956155, + 'upload_date': '20220923', + 'tags': 'count:10', + 'age_limit': 0, + 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/Quantum Leap 2022'], + 'media_type': 'Highlight', + }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, - 'skip': 'Only works from US', }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', @@ -130,6 +198,7 @@ def _real_extract(self, url): query = { 'mbr': 'true', 'manifest': 'm3u', + 'switch': 'HLSServiceSecure', } video_id = video_data['mpxGuid'] tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id) @@ -220,7 +289,7 @@ class NBCSportsIE(InfoExtractor): _TESTS = [{ # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation', 'info_dict': { 'id': 'PHJSaFWbrTY9', 'ext': 'mp4', @@ -299,7 +368,6 @@ def _real_extract(self, url): 'resourceId': base64.b64encode(resource.encode()).decode(), }).encode())['tokenizedUrl'] formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, 'title': title, @@ -309,14 +377,14 @@ def _real_extract(self, url): } -class NBCNewsIE(ThePlatformIE): +class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1'] _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate 'info_dict': { 'id': '269389891880', 'ext': 'mp4', @@ -324,6 +392,8 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 'timestamp': 1401363060, 'upload_date': '20140529', + 'duration': 46.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg', }, }, { @@ -339,7 +409,7 @@ class NBCNewsIE(ThePlatformIE): }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'md5': '40d0e48c68896359c80372306ece0fc3', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', @@ -347,11 +417,13 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, 'upload_date': '20150205', + 'duration': 1236.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939', 'info_dict': { 'id': 'n431456', 'ext': 'mp4', @@ -359,11 +431,13 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, + 'duration': 37.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg', }, }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'md5': '693d1fa21d23afcc9b04c66b227ed9ff', 'info_dict': { 'id': '669831235788', 'ext': 'mp4', @@ -371,6 +445,8 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'duration': 69.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg', }, }, { @@ -384,6 +460,7 @@ class NBCNewsIE(ThePlatformIE): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'duration': 940.0, }, }, { @@ -431,7 +508,6 @@ def _real_extract(self, url): 'tbr': tbr, 'ext': 'mp4', }) - self._sort_formats(formats) subtitles = {} closed_captioning = video_data.get('closedCaptioning') @@ -473,6 +549,7 @@ class NBCOlympicsIE(InfoExtractor): 'upload_date': '20160815', 'uploader': 'NBCU-SPORTS', }, + 'skip': '404 Not Found', } def _real_extract(self, url): @@ -516,6 +593,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, { 'note': 'Plain m3u8 source URL', 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', @@ -527,6 +605,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, ] @@ -575,7 +654,6 @@ def _real_extract(self, url): # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to # download with ffmpeg without this option f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']} - self._sort_formats(formats) return { 'id': pid, @@ -584,3 +662,190 @@ def _real_extract(self, url): 'formats': formats, 'is_live': is_live, } + + +class NBCStationsIE(InfoExtractor): + _DOMAIN_RE = '|'.join(map(re.escape, ( + 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', + 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', + 'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?P{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P[^/?#]+)/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', + 'info_dict': { + 'id': '2968618', + 'ext': 'mp4', + 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', + 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', + 'duration': 112.513, + 'timestamp': 1661135892, + 'upload_date': '20220822', + 'uploader': 'NBC 4', + 'channel_id': 'KNBC', + 'channel': 'nbclosangeles', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', + 'info_dict': { + 'id': '2247002', + 'ext': 'mp4', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'duration': 172.406, + 'timestamp': 1660886507, + 'upload_date': '20220819', + 'uploader': 'Telemundo Arizona', + 'channel_id': 'KTAZ', + 'channel': 'telemundoarizona', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # direct mp4 link + 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/', + 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85', + 'info_dict': { + 'id': '2961135', + 'ext': 'mp4', + 'title': 'Highs Near Freezing in Boston on Wednesday', + 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b', + 'duration': 235.669, + 'timestamp': 1675268656, + 'upload_date': '20230201', + 'uploader': '', + 'channel_id': 'WBTS', + 'channel': 'nbcboston', + }, + }] + + _RESOLUTIONS = { + '1080': '1920', + '720': '1280', + '540': '960', + '360': '640', + '234': '416', + } + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, video_id) + + nbc_data = self._search_json( + r'