[yt-dlp.git] / youtube_dl / extractor / cbsnews.py

# encoding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from .theplatform import ThePlatformIE
from ..utils import (
    parse_duration,
    find_xpath_attr,
)


class CBSNewsIE(ThePlatformIE):
    """Extractor for cbsnews.com /news/ and /videos/ pages.

    The page embeds a JSON blob describing the video; each available
    'media<FormatId>' entry in it is resolved through a thePlatform SMIL
    document to obtain formats and closed captions.
    """
    IE_DESC = 'CBS News'
    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'

    _TESTS = [
        {
            'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
            'info_dict': {
                'id': 'tesla-and-spacex-elon-musks-industrial-empire',
                'ext': 'flv',
                'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
                'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
                'duration': 791,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
            'info_dict': {
                'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
                'ext': 'mp4',
                'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
                # Raw string: '\.' is not a valid escape sequence in a
                # plain literal; the runtime value is unchanged.
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 205,
                'subtitles': {
                    'en': [{
                        'ext': 'ttml',
                    }],
                },
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
    ]

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Build the subtitles dict from a SMIL document's caption param.

        Looks for a <param name="ClosedCaptionURL"> element (resolved against
        `namespace`) and, when it carries a non-empty 'value' attribute,
        returns {'en': [{'ext': 'ttml', 'url': <value>}]}.

        Returns an empty dict when the element is missing or has no value,
        so the result is always a dict and can be merged by the caller.

        `subtitles_lang` is accepted but not used: captions are always filed
        under 'en' (presumably the parameter only mirrors the signature of
        the overridden parent method -- confirm against the base class).
        """
        closed_caption_e = find_xpath_attr(
            smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
        if closed_caption_e is None:
            return {}
        caption_url = closed_caption_e.attrib.get('value')
        if not caption_url:
            return {}
        return {
            'en': [{
                'ext': 'ttml',
                'url': caption_url,
            }]
        }

    def _real_extract(self, url):
        """Extract title, thumbnail, duration, formats and subtitles for `url`.

        Returns an info dict with the keys 'id', 'title', 'thumbnail',
        'duration', 'formats' and 'subtitles'.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The video description is a JSON object stored in an HTML attribute
        # of either the related-items list or the player <div>.
        video_info = self._parse_json(self._html_search_regex(
            r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
            webpage, 'video JSON info'), video_id)

        # The payload is either the item itself or wraps it under 'item'.
        item = video_info['item'] if 'item' in video_info else video_info
        title = item.get('articleTitle') or item.get('hed')
        duration = item.get('duration')
        thumbnail = item.get('mediaImage') or item.get('thumbnail')

        subtitles = {}
        formats = []
        for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
            pid = item.get('media' + format_id)
            if not pid:
                # This delivery variant is not offered for the video.
                continue
            release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid
            tp_formats, tp_subtitles = self._extract_theplatform_smil(
                release_url, video_id, 'Downloading %s SMIL data' % pid)
            formats.extend(tp_formats)
            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }


class CBSNewsLiveVideoIE(InfoExtractor):
    """Extractor for cbsnews.com/live/video/ pages."""
    IE_DESC = 'CBS News Live Videos'
    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'

    _TEST = {
        'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
        'info_dict': {
            'id': 'clinton-sanders-prepare-to-face-off-in-nh',
            'ext': 'flv',
            'title': 'Clinton, Sanders Prepare To Face Off In NH',
            'duration': 334,
        },
    }

    def _real_extract(self, url):
        """Extract the info dict for a live-video page.

        Reads the 'story' object from the page's data-story-obj attribute
        and returns 'id', 'title', 'thumbnail', 'duration' and 'formats'
        (the f4m formats of the story URL).
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        story_json = self._html_search_regex(
            r'data-story-obj=\'({.+?})\'', page, 'video JSON info')
        story = self._parse_json(story_json, video_id)['story']

        hdcore_sign = 'hdcore=3.3.1'
        manifest_url = story['url'] + '&' + hdcore_sign
        formats = self._extract_f4m_formats(manifest_url, video_id)
        # Segment URLs requested without the extra param yield a 404 error,
        # so record it on every format.
        for fmt in formats or []:
            fmt['extra_param_to_segment_url'] = hdcore_sign

        headline = story['headline']
        thumbnail = story.get('thumbnail_url_hd') or story.get('thumbnail_url_sd')
        duration = parse_duration(story.get('segmentDur'))

        return {
            'id': video_id,
            'title': headline,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
Commit	Line	Data
85e787f5 S	1	# encoding: utf-8
	2	from __future__ import unicode_literals
	3
fd3a1f3d	4	from .common import InfoExtractor
f125d911	5	from .theplatform import ThePlatformIE
8b809a07	6	from ..utils import (
	7	parse_duration,
	8	find_xpath_attr,
	9	)
85e787f5 S	10
85e787f5 S	11
f125d911	12	class CBSNewsIE(ThePlatformIE):
85e787f5	13	IE_DESC = 'CBS News'
5886b38d	14	_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news\|videos)/(?P<id>[\da-z_-]+)'
85e787f5 S	15
	16	_TESTS = [
	17	{
	18	'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
	19	'info_dict': {
	20	'id': 'tesla-and-spacex-elon-musks-industrial-empire',
	21	'ext': 'flv',
	22	'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
	23	'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
	24	'duration': 791,
	25	},
	26	'params': {
	27	# rtmp download
	28	'skip_download': True,
	29	},
	30	},
	31	{
	32	'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
	33	'info_dict': {
	34	'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
f125d911	35	'ext': 'mp4',
85e787f5	36	'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
e8cfacae	37	'thumbnail': 're:^https?://.*\.jpg$',
85e787f5	38	'duration': 205,
220ee33f S	39	'subtitles': {
	40	'en': [{
	41	'ext': 'ttml',
	42	}],
	43	},
4118cc02 JA	44	},
4118cc02 JA	45	'params': {
f125d911	46	# m3u8 download
4118cc02 JA	47	'skip_download': True,
	48	},
	49	},
85e787f5 S	50	]
85e787f5 S	51
778433cb	52	def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
8b809a07	53	closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
778433cb	54	return {
	55	'en': [{
	56	'ext': 'ttml',
	57	'url': closed_caption_e.attrib['value'],
	58	}]
	59	} if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
	60
85e787f5	61	def _real_extract(self, url):
fd3a1f3d	62	video_id = self._match_id(url)
85e787f5 S	63
	64	webpage = self._download_webpage(url, video_id)
	65
fd3a1f3d	66	video_info = self._parse_json(self._html_search_regex(
85e787f5	67	r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info\|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
fd3a1f3d	68	webpage, 'video JSON info'), video_id)
85e787f5 S	69
	70	item = video_info['item'] if 'item' in video_info else video_info
	71	title = item.get('articleTitle') or item.get('hed')
	72	duration = item.get('duration')
	73	thumbnail = item.get('mediaImage') or item.get('thumbnail')
	74
220ee33f	75	subtitles = {}
f125d911	76	formats = []
	77	for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
	78	pid = item.get('media' + format_id)
	79	if not pid:
	80	continue
4c92fd2e	81	release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid
f125d911	82	tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid)
	83	formats.extend(tp_formats)
	84	subtitles = self._merge_subtitles(subtitles, tp_subtitles)
	85	self._sort_formats(formats)
	86
85e787f5 S	87	return {
	88	'id': video_id,
	89	'title': title,
	90	'thumbnail': thumbnail,
	91	'duration': duration,
	92	'formats': formats,
4118cc02	93	'subtitles': subtitles,
5f6a1245	94	}
fd3a1f3d	95
	96
	97	class CBSNewsLiveVideoIE(InfoExtractor):
	98	IE_DESC = 'CBS News Live Videos'
5886b38d	99	_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
fd3a1f3d	100
	101	_TEST = {
	102	'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
	103	'info_dict': {
	104	'id': 'clinton-sanders-prepare-to-face-off-in-nh',
	105	'ext': 'flv',
	106	'title': 'Clinton, Sanders Prepare To Face Off In NH',
	107	'duration': 334,
	108	},
	109	}
	110
	111	def _real_extract(self, url):
	112	video_id = self._match_id(url)
	113
	114	webpage = self._download_webpage(url, video_id)
	115
	116	video_info = self._parse_json(self._html_search_regex(
	117	r'data-story-obj=\'({.+?})\'', webpage, 'video JSON info'), video_id)['story']
	118
	119	hdcore_sign = 'hdcore=3.3.1'
	120	f4m_formats = self._extract_f4m_formats(video_info['url'] + '&' + hdcore_sign, video_id)
	121	if f4m_formats:
	122	for entry in f4m_formats:
	123	# URLs without the extra param induce an 404 error
	124	entry.update({'extra_param_to_segment_url': hdcore_sign})
	125
	126	return {
	127	'id': video_id,
	128	'title': video_info['headline'],
	129	'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
	130	'duration': parse_duration(video_info.get('segmentDur')),
	131	'formats': f4m_formats,
	132	}