[yt-dlp.git] / youtube_dl / extractor / nbc.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_HTTPError,
)
from ..utils import (
    ExtractorError,
    find_xpath_attr,
)


class NBCIE(InfoExtractor):
    """Extractor for video pages on www.nbc.com.

    The page is only scraped for the URL of its embedded player; that URL
    is returned through ``url_result`` so the actual media extraction is
    delegated to whichever extractor handles it.
    """

    # Both http and https page URLs are accepted; the trailing numeric
    # component (optionally prefixed with "n") is used as the display id.
    _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'

    _TESTS = [
        {
            'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
            # md5 checksum is not stable
            'info_dict': {
                'id': 'bTmnLCvIbaaH',
                'ext': 'flv',
                'title': 'I Am a Firefighter',
                # NOTE(review): the odd spacing ("Dawson'sf irefighter") is
                # presumably copied from the site's own metadata -- confirm
                # before "fixing" it, since the test compares it verbatim.
                'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
            },
        },
        {
            'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
            'info_dict': {
                'id': 'XwU9KZkp98TH',
                'ext': 'flv',
                'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
                'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
            },
            'skip': 'Only works from US',
        },
    ]

    def _real_extract(self, url):
        """Locate the embedded player URL in the page and delegate to it.

        Raises whatever ``_search_regex`` raises when neither known player
        markup variant is present in the downloaded page.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        # Two markup variants are recognised: a full-size player carrying
        # the URL in "data-mpx-url", and a plain player element using "src".
        theplatform_url = self._search_regex(
            r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
            webpage, 'theplatform url').replace('_no_endcard', '')
        # Protocol-relative URLs need an explicit scheme before delegation.
        if theplatform_url.startswith('//'):
            theplatform_url = 'http:' + theplatform_url
        return self.url_result(theplatform_url)


class NBCNewsIE(InfoExtractor):
    """Extractor for nbcnews.com videos.

    Two URL families are handled:

    * ``/video/<slug>/<numeric id>`` pages, resolved through an XML
      document fetched by id;
    * ``/feature/...`` and ``/nightly-news/...`` pages, resolved through
      JSON embedded in the page plus the playlists it points at, with the
      final URL handed to the ThePlatform extractor.
    """

    _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
        (?:video/.+?/(?P<id>\d+)|
        (?:feature|nightly-news)/[^/]+/(?P<title>.+))
        '''

    _TESTS = [
        {
            'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
            'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
            'info_dict': {
                'id': '52753292',
                'ext': 'flv',
                'title': 'Crew emerges after four-month Mars food study',
                'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
            },
        },
        {
            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
            'md5': 'b2421750c9f260783721d898f4c42063',
            'info_dict': {
                'id': 'I1wpAI_zmhsQ',
                'ext': 'mp4',
                'title': 'How Twitter Reacted To The Snowden Interview',
                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
            },
            'add_ie': ['ThePlatform'],
        },
        {
            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
            'info_dict': {
                'id': 'Wjf9EDR3A_60',
                'ext': 'mp4',
                'title': 'FULL EPISODE: Family Business',
                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
            },
        },
        {
            'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
            'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
            'info_dict': {
                'id': 'sekXqyTVnmN3',
                'ext': 'mp4',
                'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict (or a ThePlatform url result) for ``url``.

        Raises ExtractorError when a feature/nightly-news page yields no
        usable video assets, either from its playlists or from the JSON
        embedded in the page itself.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        if video_id is not None:
            # Numeric-id pages: metadata and media URL come from one XML file.
            all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
            info = all_info.find('video')

            return {
                'id': video_id,
                'title': info.find('headline').text,
                'ext': 'flv',
                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
                'description': compat_str(info.find('caption').text),
                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
            }

        # "feature" and "nightly-news" pages use theplatform.com
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        bootstrap_json = self._search_regex(
            r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
            webpage, 'bootstrap json', flags=re.MULTILINE)
        # video_id is always None on this branch, so the URL title is used
        # to label any JSON parsing error instead.
        bootstrap = self._parse_json(bootstrap_json, title)
        info = bootstrap['results'][0]['video']
        mpxid = info['mpxId']

        # Either playlist key may be absent or empty; such entries are
        # skipped rather than aborting the whole extraction.
        base_urls = [
            info.get('fallbackPlaylistUrl'),
            info.get('associatedPlaylistUrl'),
        ]

        playlist_entry = None
        for base_url in base_urls:
            if not base_url:
                continue
            playlist_url = base_url + '?form=MPXNBCNewsAPI'

            try:
                all_videos = self._download_json(playlist_url, title)
            except ExtractorError as ee:
                # An HTTP failure on one playlist is not fatal: try the next.
                if isinstance(ee.cause, compat_HTTPError):
                    continue
                raise

            if not all_videos or 'videos' not in all_videos:
                continue

            playlist_entry = next(
                (v for v in all_videos['videos'] if v.get('mpxId') == mpxid),
                None)
            if playlist_entry is not None:
                break

        # Prefer the playlist entry; otherwise fall back to the video
        # description embedded in the page, as the code did before.
        if playlist_entry is not None:
            info = playlist_entry

        video_assets = info.get('videoAssets')
        if not video_assets:
            raise ExtractorError('Could not find video in playlists')

        return {
            '_type': 'url',
            # We get the best quality video
            'url': video_assets[-1]['publicUrl'],
            'ie_key': 'ThePlatform',
        }
Commit	Line	Data
cd7ee7aa JMF	1	from __future__ import unicode_literals
cd7ee7aa JMF	2
0bc56fa6	3	import re
0bc56fa6 JMF	4
0bc56fa6 JMF	5	from .common import InfoExtractor
1cc79574	6	from ..compat import (
37e64add	7	compat_str,
2df54b4b	8	compat_HTTPError,
1cc79574 PH	9	)
1cc79574 PH	10	from ..utils import (
37e64add PH	11	ExtractorError,
	12	find_xpath_attr,
	13	)
0bc56fa6 JMF	14
0bc56fa6 JMF	15
020cf5eb	16	class NBCIE(InfoExtractor):
58c1f6f0 S	17	_VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
	18
	19	_TESTS = [
	20	{
	21	'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
	22	# md5 checksum is not stable
	23	'info_dict': {
	24	'id': 'bTmnLCvIbaaH',
	25	'ext': 'flv',
	26	'title': 'I Am a Firefighter',
	27	'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
	28	},
020cf5eb	29	},
58c1f6f0 S	30	{
	31	'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
	32	'info_dict': {
	33	'id': 'XwU9KZkp98TH',
	34	'ext': 'flv',
	35	'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
	36	'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
	37	},
	38	'skip': 'Only works from US',
	39	},
	40	]
020cf5eb JMF	41
020cf5eb JMF	42	def _real_extract(self, url):
10e3d734	43	video_id = self._match_id(url)
020cf5eb	44	webpage = self._download_webpage(url, video_id)
58c1f6f0 S	45	theplatform_url = self._search_regex(
	46	'(?:class="video-player video-player-full" data-mpx-url\|class="player" src)="(.*?)"',
	47	webpage, 'theplatform url').replace('_no_endcard', '')
020cf5eb JMF	48	if theplatform_url.startswith('//'):
	49	theplatform_url = 'http:' + theplatform_url
	50	return self.url_result(theplatform_url)
	51
	52
0bc56fa6	53	class NBCNewsIE(InfoExtractor):
a4f3d779 S	54	_VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
	55	(?:video/.+?/(?P<id>\d+)\|
	56	(?:feature\|nightly-news)/[^/]+/(?P<title>.+))
87fe568c	57	'''
0bc56fa6	58
87fe568c JMF	59	_TESTS = [
	60	{
	61	'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
	62	'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
	63	'info_dict': {
	64	'id': '52753292',
	65	'ext': 'flv',
	66	'title': 'Crew emerges after four-month Mars food study',
	67	'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
	68	},
0bc56fa6	69	},
87fe568c JMF	70	{
	71	'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
	72	'md5': 'b2421750c9f260783721d898f4c42063',
	73	'info_dict': {
	74	'id': 'I1wpAI_zmhsQ',
10e3d734	75	'ext': 'mp4',
87fe568c JMF	76	'title': 'How Twitter Reacted To The Snowden Interview',
	77	'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
	78	},
	79	'add_ie': ['ThePlatform'],
	80	},
2df54b4b S	81	{
	82	'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
	83	'md5': 'fdbf39ab73a72df5896b6234ff98518a',
	84	'info_dict': {
	85	'id': 'Wjf9EDR3A_60',
	86	'ext': 'mp4',
	87	'title': 'FULL EPISODE: Family Business',
	88	'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
	89	},
	90	},
d9aa2b78 RS	91	{
	92	'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
	93	'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
	94	'info_dict': {
	95	'id': 'sekXqyTVnmN3',
	96	'ext': 'mp4',
	97	'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
	98	'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
	99	},
	100	},
87fe568c	101	]
0bc56fa6 JMF	102
	103	def _real_extract(self, url):
	104	mobj = re.match(self._VALID_URL, url)
	105	video_id = mobj.group('id')
87fe568c JMF	106	if video_id is not None:
	107	all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
	108	info = all_info.find('video')
0bc56fa6	109
87fe568c JMF	110	return {
	111	'id': video_id,
	112	'title': info.find('headline').text,
	113	'ext': 'flv',
	114	'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
	115	'description': compat_str(info.find('caption').text),
	116	'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
	117	}
	118	else:
d9aa2b78	119	# "feature" and "nightly-news" pages use theplatform.com
87fe568c JMF	120	title = mobj.group('title')
87fe568c JMF	121	webpage = self._download_webpage(url, title)
a4f3d779 S	122	bootstrap_json = self._search_regex(
	123	r'var\s+(?:bootstrapJson\|playlistData)\s=\s({.+});?\s*$',
	124	webpage, 'bootstrap json', flags=re.MULTILINE)
	125	bootstrap = self._parse_json(bootstrap_json, video_id)
87fe568c	126	info = bootstrap['results'][0]['video']
87fe568c	127	mpxid = info['mpxId']
754d8a03 PH	128
	129	base_urls = [
	130	info['fallbackPlaylistUrl'],
	131	info['associatedPlaylistUrl'],
	132	]
	133
	134	for base_url in base_urls:
10e3d734 PH	135	if not base_url:
10e3d734 PH	136	continue
754d8a03	137	playlist_url = base_url + '?form=MPXNBCNewsAPI'
754d8a03 PH	138
754d8a03 PH	139	try:
2df54b4b S	140	all_videos = self._download_json(playlist_url, title)
	141	except ExtractorError as ee:
	142	if isinstance(ee.cause, compat_HTTPError):
	143	continue
	144	raise
	145
47e0e1e0	146	if not all_videos or 'videos' not in all_videos:
2df54b4b S	147	continue
	148
	149	try:
	150	info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
754d8a03 PH	151	break
	152	except StopIteration:
	153	continue
	154
	155	if info is None:
	156	raise ExtractorError('Could not find video in playlists')
87fe568c JMF	157
	158	return {
	159	'_type': 'url',
	160	# We get the best quality video
	161	'url': info['videoAssets'][-1]['publicUrl'],
	162	'ie_key': 'ThePlatform',
	163	}