[yt-dlp.git] / youtube_dl / extractor / nbc.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_HTTPError,
)
from ..utils import (
    ExtractorError,
    find_xpath_attr,
)


class NBCIE(InfoExtractor):
    """Extractor for nbc.com pages whose URL ends in a numeric id.

    The page is scraped for the player URL it embeds, and that URL is
    returned as a URL result without naming a specific extractor.
    """
    _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'

    _TESTS = [
        {
            'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
            # md5 checksum is not stable
            'info_dict': {
                'id': 'c9xnCo0YPOPH',
                'ext': 'flv',
                'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
                'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
            },
        },
        {
            'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
            'info_dict': {
                'id': 'XwU9KZkp98TH',
                'ext': 'flv',
                'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
                'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
            },
            'skip': 'Only works from US',
        },
    ]

    def _real_extract(self, url):
        """Return a URL result pointing at the player URL found in the page."""
        page_id = self._match_id(url)
        page = self._download_webpage(url, page_id)

        # The player URL lives either in a data-mpx-url attribute of the
        # full-size player element or in the src attribute of the "player"
        # element.
        raw_player_url = self._search_regex(
            '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
            page, 'theplatform url')
        player_url = raw_player_url.replace('_no_endcard', '')

        # Protocol-relative URLs get an explicit http scheme.
        scheme = 'http:' if player_url.startswith('//') else ''
        return self.url_result(scheme + player_url)


class NBCSportsVPlayerIE(InfoExtractor):
    """Extractor for vplayer.nbcsports.com player pages.

    Also exposes ``_extract_url`` so other extractors can locate an
    embedded vplayer iframe inside an arbitrary page.
    """
    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'

    _TESTS = [{
        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
        'info_dict': {
            'id': '9CsDKds0kvHI',
            'ext': 'flv',
            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
        }
    }, {
        'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_url(webpage):
        """Return the src of the first vplayer iframe in *webpage*.

        Returns None when the page contains no such iframe.
        """
        found = re.search(
            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
        return found.group('url') if found else None

    def _real_extract(self, url):
        """Hand the page's og:video URL over to the ThePlatform extractor."""
        player_id = self._match_id(url)
        page = self._download_webpage(url, player_id)
        return self.url_result(self._og_search_video_url(page), 'ThePlatform')


class NBCSportsIE(InfoExtractor):
    """Extractor for www.nbcsports.com article pages.

    The page is searched for an embedded vplayer.nbcsports.com iframe and
    the iframe URL is delegated to the NBCSportsVPlayer extractor.
    """
    # Does not include https because its certificate is invalid
    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'

    _TEST = {
        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
        'info_dict': {
            'id': 'PHJSaFWbrTY9',
            'ext': 'flv',
            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
        }
    }

    def _real_extract(self, url):
        """Return a URL result for the vplayer iframe embedded in the page.

        Raises:
            ExtractorError: if the page contains no vplayer iframe.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        vplayer_url = NBCSportsVPlayerIE._extract_url(webpage)
        # _extract_url returns None when no iframe matches; fail here with a
        # clear message instead of forwarding a None URL to url_result.
        if not vplayer_url:
            raise ExtractorError('Unable to find NBC Sports vplayer iframe')
        return self.url_result(vplayer_url, 'NBCSportsVPlayer')


class NBCNewsIE(InfoExtractor):
    """Extractor for nbcnews.com videos.

    URLs of the form ``/video/.../<digits>`` are resolved through the site's
    XML endpoint. ``feature`` and ``nightly-news`` pages are resolved through
    JSON embedded in the page plus the playlists that JSON points to, and the
    final media URL is delegated to the ThePlatform extractor.
    """
    _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
        (?:video/.+?/(?P<id>\d+)|
        (?:feature|nightly-news)/[^/]+/(?P<title>.+))
        '''

    _TESTS = [
        {
            'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
            'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
            'info_dict': {
                'id': '52753292',
                'ext': 'flv',
                'title': 'Crew emerges after four-month Mars food study',
                'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
            },
        },
        {
            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
            'md5': 'b2421750c9f260783721d898f4c42063',
            'info_dict': {
                'id': 'I1wpAI_zmhsQ',
                'ext': 'mp4',
                'title': 'How Twitter Reacted To The Snowden Interview',
                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
            },
            'add_ie': ['ThePlatform'],
        },
        {
            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
            'info_dict': {
                'id': 'Wjf9EDR3A_60',
                'ext': 'mp4',
                'title': 'FULL EPISODE: Family Business',
                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
            },
        },
        {
            'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
            'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
            'info_dict': {
                'id': 'sekXqyTVnmN3',
                'ext': 'mp4',
                'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
            },
        },
    ]

    def _real_extract(self, url):
        """Extract video information for an nbcnews.com URL.

        Returns:
            An info dict with a direct media URL for numeric video URLs, or
            a ``url``-type result for the ThePlatform extractor otherwise.

        Raises:
            ExtractorError: if no video assets can be located for a
                "feature" / "nightly-news" page, or if a playlist download
                fails for a reason other than an HTTP error.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        if video_id is not None:
            all_info = self._download_xml(
                'http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
            info = all_info.find('video')

            return {
                'id': video_id,
                'title': info.find('headline').text,
                'ext': 'flv',
                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
                'description': compat_str(info.find('caption').text),
                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
            }

        # "feature" and "nightly-news" pages use theplatform.com
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        bootstrap_json = self._search_regex(
            r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
            webpage, 'bootstrap json', flags=re.MULTILINE)
        # video_id is always None on this branch, so identify the page by
        # its URL slug when parsing.
        bootstrap = self._parse_json(bootstrap_json, title)
        info = bootstrap['results'][0]['video']
        mpxid = info['mpxId']

        # Either key may be absent or empty; such entries are skipped below.
        base_urls = [
            info.get('fallbackPlaylistUrl'),
            info.get('associatedPlaylistUrl'),
        ]

        for base_url in base_urls:
            if not base_url:
                continue
            playlist_url = base_url + '?form=MPXNBCNewsAPI'

            try:
                all_videos = self._download_json(playlist_url, title)
            except ExtractorError as ee:
                # An HTTP failure on one playlist is not fatal: try the next.
                if isinstance(ee.cause, compat_HTTPError):
                    continue
                raise

            if not all_videos or 'videos' not in all_videos:
                continue

            playlist_entry = next(
                (v for v in all_videos['videos'] if v['mpxId'] == mpxid), None)
            if playlist_entry is not None:
                info = playlist_entry
                break

        # When no playlist contains the video, info still holds the
        # bootstrap entry, so a check against None can never fire. Check
        # for usable assets instead, so a missing video is reported as an
        # ExtractorError rather than a KeyError/IndexError.
        video_assets = info.get('videoAssets')
        if not video_assets:
            raise ExtractorError('Could not find video in playlists')

        return {
            '_type': 'url',
            # We get the best quality video
            'url': video_assets[-1]['publicUrl'],
            'ie_key': 'ThePlatform',
        }
Commit	Line	Data
cd7ee7aa JMF	1	from __future__ import unicode_literals
cd7ee7aa JMF	2
0bc56fa6	3	import re
0bc56fa6 JMF	4
0bc56fa6 JMF	5	from .common import InfoExtractor
1cc79574	6	from ..compat import (
37e64add	7	compat_str,
2df54b4b	8	compat_HTTPError,
1cc79574 PH	9	)
1cc79574 PH	10	from ..utils import (
37e64add PH	11	ExtractorError,
	12	find_xpath_attr,
	13	)
0bc56fa6 JMF	14
0bc56fa6 JMF	15
020cf5eb	16	class NBCIE(InfoExtractor):
cb88671e	17	_VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
58c1f6f0 S	18
	19	_TESTS = [
	20	{
5c8a3f86	21	'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
58c1f6f0 S	22	# md5 checksum is not stable
58c1f6f0 S	23	'info_dict': {
5c8a3f86	24	'id': 'c9xnCo0YPOPH',
58c1f6f0	25	'ext': 'flv',
5c8a3f86 JMF	26	'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
5c8a3f86 JMF	27	'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
58c1f6f0	28	},
020cf5eb	29	},
58c1f6f0 S	30	{
	31	'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
	32	'info_dict': {
	33	'id': 'XwU9KZkp98TH',
	34	'ext': 'flv',
	35	'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
	36	'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
	37	},
	38	'skip': 'Only works from US',
	39	},
	40	]
020cf5eb JMF	41
020cf5eb JMF	42	def _real_extract(self, url):
10e3d734	43	video_id = self._match_id(url)
020cf5eb	44	webpage = self._download_webpage(url, video_id)
58c1f6f0 S	45	theplatform_url = self._search_regex(
	46	'(?:class="video-player video-player-full" data-mpx-url\|class="player" src)="(.*?)"',
	47	webpage, 'theplatform url').replace('_no_endcard', '')
020cf5eb JMF	48	if theplatform_url.startswith('//'):
	49	theplatform_url = 'http:' + theplatform_url
	50	return self.url_result(theplatform_url)
	51
	52
a2a4d5fa	53	class NBCSportsVPlayerIE(InfoExtractor):
a2edf2e7	54	_VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
a28ccbab	55
5cbb2699	56	_TESTS = [{
a28ccbab	57	'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
a28ccbab YCH	58	'info_dict': {
	59	'id': '9CsDKds0kvHI',
	60	'ext': 'flv',
	61	'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
	62	'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
	63	}
5cbb2699	64	}, {
5cbb2699 YCH	65	'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
	66	'only_matching': True,
	67	}]
a28ccbab	68
a2a4d5fa YCH	69	@staticmethod
	70	def _extract_url(webpage):
	71	iframe_m = re.search(
	72	r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
	73	if iframe_m:
	74	return iframe_m.group('url')
	75
a28ccbab YCH	76	def _real_extract(self, url):
	77	video_id = self._match_id(url)
	78	webpage = self._download_webpage(url, video_id)
	79	theplatform_url = self._og_search_video_url(webpage)
	80	return self.url_result(theplatform_url, 'ThePlatform')
	81
	82
a2a4d5fa YCH	83	class NBCSportsIE(InfoExtractor):
	84	# Does not include https becuase its certificate is invalid
	85	_VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
	86
	87	_TEST = {
	88	'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
a2a4d5fa YCH	89	'info_dict': {
	90	'id': 'PHJSaFWbrTY9',
	91	'ext': 'flv',
	92	'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
	93	'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
	94	}
	95	}
	96
	97	def _real_extract(self, url):
	98	video_id = self._match_id(url)
	99	webpage = self._download_webpage(url, video_id)
	100	return self.url_result(
	101	NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
	102
	103
0bc56fa6	104	class NBCNewsIE(InfoExtractor):
a4f3d779 S	105	_VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
	106	(?:video/.+?/(?P<id>\d+)\|
	107	(?:feature\|nightly-news)/[^/]+/(?P<title>.+))
87fe568c	108	'''
0bc56fa6	109
87fe568c JMF	110	_TESTS = [
	111	{
	112	'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
	113	'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
	114	'info_dict': {
	115	'id': '52753292',
	116	'ext': 'flv',
	117	'title': 'Crew emerges after four-month Mars food study',
	118	'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
	119	},
0bc56fa6	120	},
87fe568c JMF	121	{
	122	'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
	123	'md5': 'b2421750c9f260783721d898f4c42063',
	124	'info_dict': {
	125	'id': 'I1wpAI_zmhsQ',
10e3d734	126	'ext': 'mp4',
87fe568c JMF	127	'title': 'How Twitter Reacted To The Snowden Interview',
	128	'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
	129	},
	130	'add_ie': ['ThePlatform'],
	131	},
2df54b4b S	132	{
	133	'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
	134	'md5': 'fdbf39ab73a72df5896b6234ff98518a',
	135	'info_dict': {
	136	'id': 'Wjf9EDR3A_60',
	137	'ext': 'mp4',
	138	'title': 'FULL EPISODE: Family Business',
	139	'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
	140	},
	141	},
d9aa2b78 RS	142	{
	143	'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
	144	'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
	145	'info_dict': {
	146	'id': 'sekXqyTVnmN3',
	147	'ext': 'mp4',
	148	'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
	149	'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
	150	},
	151	},
87fe568c	152	]
0bc56fa6 JMF	153
	154	def _real_extract(self, url):
	155	mobj = re.match(self._VALID_URL, url)
	156	video_id = mobj.group('id')
87fe568c JMF	157	if video_id is not None:
	158	all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
	159	info = all_info.find('video')
0bc56fa6	160
87fe568c JMF	161	return {
	162	'id': video_id,
	163	'title': info.find('headline').text,
	164	'ext': 'flv',
	165	'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
	166	'description': compat_str(info.find('caption').text),
	167	'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
	168	}
	169	else:
d9aa2b78	170	# "feature" and "nightly-news" pages use theplatform.com
87fe568c JMF	171	title = mobj.group('title')
87fe568c JMF	172	webpage = self._download_webpage(url, title)
a4f3d779 S	173	bootstrap_json = self._search_regex(
	174	r'var\s+(?:bootstrapJson\|playlistData)\s=\s({.+});?\s*$',
	175	webpage, 'bootstrap json', flags=re.MULTILINE)
	176	bootstrap = self._parse_json(bootstrap_json, video_id)
87fe568c	177	info = bootstrap['results'][0]['video']
87fe568c	178	mpxid = info['mpxId']
754d8a03 PH	179
	180	base_urls = [
	181	info['fallbackPlaylistUrl'],
	182	info['associatedPlaylistUrl'],
	183	]
	184
	185	for base_url in base_urls:
10e3d734 PH	186	if not base_url:
10e3d734 PH	187	continue
754d8a03	188	playlist_url = base_url + '?form=MPXNBCNewsAPI'
754d8a03 PH	189
754d8a03 PH	190	try:
2df54b4b S	191	all_videos = self._download_json(playlist_url, title)
	192	except ExtractorError as ee:
	193	if isinstance(ee.cause, compat_HTTPError):
	194	continue
	195	raise
	196
47e0e1e0	197	if not all_videos or 'videos' not in all_videos:
2df54b4b S	198	continue
	199
	200	try:
	201	info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
754d8a03 PH	202	break
	203	except StopIteration:
	204	continue
	205
	206	if info is None:
	207	raise ExtractorError('Could not find video in playlists')
87fe568c JMF	208
	209	return {
	210	'_type': 'url',
	211	# We get the best quality video
	212	'url': info['videoAssets'][-1]['publicUrl'],
	213	'ie_key': 'ThePlatform',
	214	}