[yt-dlp.git] / yt_dlp / extractor / cbs.py

from __future__ import unicode_literals

from .theplatform import ThePlatformFeedIE
from ..utils import (
    ExtractorError,
    int_or_none,
    find_xpath_attr,
    xpath_element,
    xpath_text,
    update_url_query,
)


class CBSBaseIE(ThePlatformFeedIE):
    """Shared base for CBS extractors; customizes SMIL subtitle parsing."""

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect caption tracks advertised through SMIL ``<param>`` elements.

        For each known caption parameter name, the first ``param`` element
        (looked up anywhere below ``smil``) whose ``name`` attribute matches is
        inspected, and its ``value`` attribute is taken as the caption URL.

        Returns a dict mapping ``subtitles_lang`` to a list of
        ``{'ext': ..., 'url': ...}`` entries, or an empty dict when no caption
        URL was found.
        """
        # (param name attribute, subtitle extension) pairs, in lookup order
        caption_params = (
            ('sMPTE-TTCCURL', 'tt'),
            ('ClosedCaptionURL', 'ttml'),
            ('webVTTCaptionURL', 'vtt'),
        )
        tracks = []
        for param_name, extension in caption_params:
            param_node = find_xpath_attr(
                smil, self._xpath_ns('.//param', namespace), 'name', param_name)
            if param_node is None:
                continue
            caption_url = param_node.get('value')
            if not caption_url:
                # param exists but carries no usable URL
                continue
            tracks.append({
                'ext': extension,
                'url': caption_url,
            })
        # Only create the language key when at least one track was found
        return {subtitles_lang: tracks} if tracks else {}


class CBSIE(CBSBaseIE):
    """Extractor for cbs.com, paramountplus.com and colbertlateshow.com videos.

    Also accepts the internal ``cbs:<content id>`` URL form.
    """

    _VALID_URL = r'''(?x)
        (?:
            cbs:|
            https?://(?:www\.)?(?:
                (?:cbs|paramountplus)\.com/(?:shows/[^/]+/video|movies/[^/]+)/|
                colbertlateshow\.com/(?:video|podcasts)/)
        )(?P<id>[\w-]+)'''

    _TESTS = [{
        'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
        'info_dict': {
            'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
            'ext': 'mp4',
            'title': 'Connect Chat feat. Garth Brooks',
            'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
            'duration': 1495,
            'timestamp': 1385585425,
            'upload_date': '20131127',
            'uploader': 'CBSI-NEW',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        '_skip': 'Blocked outside the US',
    }, {
        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
        'only_matching': True,
    }, {
        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
        'only_matching': True,
    }, {
        'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
        'only_matching': True,
    }, {
        'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq',
        'only_matching': True,
    }]

    def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
        """Build the info dict for ``content_id``.

        Downloads the player-service XML, then, for every distinct asset type
        listed in it, fetches the matching thePlatform SMIL and accumulates
        its formats and subtitles. The result starts from the thePlatform
        metadata and is overlaid with the fields read from the player XML.

        :param content_id: content identifier, as captured by ``_VALID_URL``
        :param site: value sent as the ``partner`` query parameter
        :param mpx_acc: numeric account id interpolated into the thePlatform path
        :raises ExtractorError: if the player XML contains no ``item`` element,
            or (re-raising the last SMIL failure) if no format could be
            extracted and at least one SMIL request failed
        """
        items_data = self._download_xml(
            'https://can.cbs.com/thunder/player/videoPlayerService.php',
            content_id, query={'partner': site, 'contentId': content_id})
        # fatal=True: an empty response should surface as an ExtractorError
        # here instead of an AttributeError on None in the lookups below.
        video_data = xpath_element(items_data, './/item', 'item', fatal=True)
        # The title element has been seen under two spellings
        title = xpath_text(video_data, 'videoTitle', 'title') or xpath_text(video_data, 'videotitle', 'title')
        tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
        tp_release_url = 'https://link.theplatform.com/s/' + tp_path

        seen_asset_types = set()
        subtitles = {}
        formats = []
        last_e = None
        for item in items_data.findall('.//item'):
            asset_type = xpath_text(item, 'assetType')
            # Skip items without an asset type, duplicates, and the
            # HLS_FPS / DASH_CENC variants (presumably DRM-protected).
            if not asset_type or asset_type in seen_asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
                continue
            seen_asset_types.add(asset_type)
            query = {
                'mbr': 'true',
                'assetTypes': asset_type,
            }
            if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
                query['formats'] = 'MPEG4,M3U'
            elif asset_type in ('RTMP', 'WIFI', '3G'):
                query['formats'] = 'MPEG4,FLV'
            try:
                tp_formats, tp_subtitles = self._extract_theplatform_smil(
                    update_url_query(tp_release_url, query), content_id,
                    'Downloading %s SMIL data' % asset_type)
            except ExtractorError as e:
                # Best effort: remember the failure and try the other asset types
                last_e = e
                continue
            formats.extend(tp_formats)
            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
        # Only report a SMIL failure when nothing at all could be extracted
        if last_e and not formats:
            raise last_e
        self._sort_formats(formats)

        info = self._extract_theplatform_metadata(tp_path, content_id)
        overrides = {
            'id': content_id,
            'title': title,
            'series': xpath_text(video_data, 'seriesTitle'),
            'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
            # videoLength is scaled down by 1000 (presumably ms -> s)
            'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
            'thumbnail': xpath_text(video_data, 'previewImageURL'),
            'formats': formats,
            'subtitles': subtitles,
        }
        # Fields missing from the player XML come back as None; do not let
        # them wipe out values already provided by the thePlatform metadata.
        info.update({k: v for k, v in overrides.items() if v is not None})
        return info

    def _real_extract(self, url):
        """Extract the content id from ``url`` and return its info dict."""
        content_id = self._match_id(url)
        return self._extract_video_info(content_id)
Commit	Line	Data
e42a692f PH	1	from __future__ import unicode_literals
e42a692f PH	2
43518503	3	from .theplatform import ThePlatformFeedIE
5c2266df	4	from ..utils import (
21dedcb5	5	ExtractorError,
63c55e9f	6	int_or_none,
63c55e9f	7	find_xpath_attr,
45cae3b0 RA	8	xpath_element,
	9	xpath_text,
	10	update_url_query,
5c2266df	11	)
fa3ae234 PH	12
fa3ae234 PH	13
43518503	14	class CBSBaseIE(ThePlatformFeedIE):
3e0c3d14	15	def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
180a9dff RA	16	subtitles = {}
	17	for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]:
	18	cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k)
	19	if cc_e is not None:
	20	cc_url = cc_e.get('value')
	21	if cc_url:
	22	subtitles.setdefault(subtitles_lang, []).append({
	23	'ext': ext,
	24	'url': cc_url,
	25	})
	26	return subtitles
3e0c3d14	27
	28
	29	class CBSIE(CBSBaseIE):
c755f190	30	_VALID_URL = r'''(?x)
	31	(?:
	32	cbs:\|
	33	https?://(?:www\.)?(?:
	34	(?:cbs\|paramountplus)\.com/(?:shows/[^/]+/video\|movies/[^/]+)/\|
	35	colbertlateshow\.com/(?:video\|podcasts)/)
	36	)(?P<id>[\w-]+)'''
fa3ae234	37
2871d489	38	_TESTS = [{
2c736b4f	39	'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
e42a692f	40	'info_dict': {
63c55e9f	41	'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
63c55e9f	42	'ext': 'mp4',
e42a692f PH	43	'title': 'Connect Chat feat. Garth Brooks',
	44	'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
	45	'duration': 1495,
79ba9140	46	'timestamp': 1385585425,
	47	'upload_date': '20131127',
	48	'uploader': 'CBSI-NEW',
fa3ae234	49	},
dabe1570 RA	50	'params': {
	51	# m3u8 download
	52	'skip_download': True,
	53	},
e42a692f	54	'_skip': 'Blocked outside the US',
9bf99891 S	55	}, {
	56	'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
	57	'only_matching': True,
	58	}, {
9d581f3d	59	'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
9bf99891	60	'only_matching': True,
2c736b4f	61	}, {
10db0d2f	62	'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
2c736b4f	63	'only_matching': True,
c755f190	64	}, {
	65	'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq',
	66	'only_matching': True,
2871d489	67	}]
dabe1570	68
96820c1c	69	def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
45cae3b0	70	items_data = self._download_xml(
2c736b4f	71	'https://can.cbs.com/thunder/player/videoPlayerService.php',
96820c1c	72	content_id, query={'partner': site, 'contentId': content_id})
45cae3b0	73	video_data = xpath_element(items_data, './/item')
430c2757	74	title = xpath_text(video_data, 'videoTitle', 'title') or xpath_text(video_data, 'videotitle', 'title')
96820c1c	75	tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
2c736b4f	76	tp_release_url = 'https://link.theplatform.com/s/' + tp_path
45cae3b0 RA	77
	78	asset_types = []
	79	subtitles = {}
	80	formats = []
21dedcb5	81	last_e = None
45cae3b0 RA	82	for item in items_data.findall('.//item'):
45cae3b0 RA	83	asset_type = xpath_text(item, 'assetType')
35c2dd48	84	if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
dabe1570	85	continue
45cae3b0 RA	86	asset_types.append(asset_type)
	87	query = {
	88	'mbr': 'true',
	89	'assetTypes': asset_type,
	90	}
	91	if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
	92	query['formats'] = 'MPEG4,M3U'
	93	elif asset_type in ('RTMP', 'WIFI', '3G'):
	94	query['formats'] = 'MPEG4,FLV'
21dedcb5 S	95	try:
	96	tp_formats, tp_subtitles = self._extract_theplatform_smil(
	97	update_url_query(tp_release_url, query), content_id,
	98	'Downloading %s SMIL data' % asset_type)
	99	except ExtractorError as e:
	100	last_e = e
	101	continue
45cae3b0 RA	102	formats.extend(tp_formats)
45cae3b0 RA	103	subtitles = self._merge_subtitles(subtitles, tp_subtitles)
21dedcb5 S	104	if last_e and not formats:
21dedcb5 S	105	raise last_e
dabe1570	106	self._sort_formats(formats)
45cae3b0 RA	107
45cae3b0 RA	108	info = self._extract_theplatform_metadata(tp_path, content_id)
dabe1570	109	info.update({
45cae3b0 RA	110	'id': content_id,
	111	'title': title,
	112	'series': xpath_text(video_data, 'seriesTitle'),
	113	'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
	114	'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
	115	'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
	116	'thumbnail': xpath_text(video_data, 'previewImageURL'),
dabe1570 RA	117	'formats': formats,
dabe1570 RA	118	'subtitles': subtitles,
dabe1570 RA	119	})
dabe1570 RA	120	return info
63c55e9f	121
fa3ae234	122	def _real_extract(self, url):
43518503	123	content_id = self._match_id(url)
dabe1570	124	return self._extract_video_info(content_id)