[yt-dlp.git] / yt_dlp / extractor / huffpost.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    parse_duration,
    unified_strdate,
)


class HuffPostIE(InfoExtractor):
    """Extractor for Huffington Post Live segments.

    Matches both the regular segment pages (``/r/segment/<slug>/<id>``) and
    the embed player (``/HPLEmbedPlayer/?segmentId=<id>``) and resolves them
    through the segment JSON API on the embed host.
    """
    IE_DESC = 'Huffington Post'
    _VALID_URL = r'''(?x)
        https?://(embed\.)?live\.huffingtonpost\.com/
        (?:
            r/segment/[^/]+/|
            HPLEmbedPlayer/\?segmentId=
        )
        (?P<id>[0-9a-f]+)'''

    _TEST = {
        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
        'md5': '55f5e8981c1c80a64706a44b74833de8',
        'info_dict': {
            'id': '52dd3e4b02a7602131000677',
            'ext': 'mp4',
            'title': 'Legalese It! with @MikeSacksHP',
            'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more.  ',
            'duration': 1549,
            'upload_date': '20140124',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['HTTP Error 404: Not Found'],
    }

    def _real_extract(self, url):
        """Build the info dict for the segment referenced by *url*.

        The segment id is taken from the URL and its metadata is downloaded
        from ``/api/segments/<id>.json``. Formats are collected from the
        ``live`` and ``live_again`` source groups; when none are found and
        the metadata carries a ``fivemin_id``, the result is delegated to
        the ``5min:<id>`` URL instead.

        Raises KeyError if the API response lacks ``data`` or the segment
        lacks a ``title``; all other metadata fields are optional.
        """
        video_id = self._match_id(url)

        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
        data = self._download_json(api_url, video_id)['data']

        video_title = data['title']
        duration = parse_duration(data.get('running_time'))
        # `or {}` instead of a .get() default: the default only covers a
        # missing key, whereas an explicit JSON null would still yield None.
        schedule = data.get('schedule') or {}
        upload_date = unified_strdate(
            schedule.get('starts_at') or data.get('segment_start_date_time'))
        description = data.get('description')

        # Thumbnails are optional metadata; a missing or null 'images'
        # mapping must not abort the extraction.
        thumbnails = []
        for thumbnail_url in filter(None, (data.get('images') or {}).values()):
            # Only images whose name ends in "-<width>x<height>." are used
            m = re.match(r'.*-([0-9]+x[0-9]+)\.', thumbnail_url)
            if not m:
                continue
            resolution = m.group(1)
            width, height = resolution.split('x')
            thumbnails.append({
                'url': thumbnail_url,
                'resolution': resolution,
                'width': int(width),
                'height': int(height),
            })

        formats = []
        sources = data.get('sources') or {}
        # 'live' entries first, then 'live_again', each as (key, url) pairs
        live_sources = []
        for group in ('live', 'live_again'):
            live_sources.extend((sources.get(group) or {}).items())
        for key, media_url in live_sources:
            if not media_url:
                # A null/empty URL cannot be downloaded; skipping it also
                # keeps the fivemin fallback below reachable.
                continue
            ext = determine_ext(media_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    media_url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
            else:
                # Direct media link; the source key (e.g. "audio/...") is
                # the only hint about what the stream contains.
                formats.append({
                    'format': key,
                    'format_id': key.replace('/', '.'),
                    'ext': 'mp4',
                    'url': media_url,
                    'vcodec': 'none' if key.startswith('audio/') else None,
                })

        if not formats and data.get('fivemin_id'):
            return self.url_result('5min:%s' % data['fivemin_id'])

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            'description': description,
            'formats': formats,
            'duration': duration,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
        }
Commit	Line	Data
db1f3888 PH	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
f0ec61b5	7	determine_ext,
db1f3888 PH	8	parse_duration,
	9	unified_strdate,
	10	)
	11
	12
	13	class HuffPostIE(InfoExtractor):
	14	IE_DESC = 'Huffington Post'
	15	_VALID_URL = r'''(?x)
	16	https?://(embed\.)?live\.huffingtonpost\.com/
	17	(?:
	18	r/segment/[^/]+/\|
	19	HPLEmbedPlayer/\?segmentId=
	20	)
	21	(?P<id>[0-9a-f]+)'''
	22
	23	_TEST = {
	24	'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
d16076ff	25	'md5': '55f5e8981c1c80a64706a44b74833de8',
db1f3888	26	'info_dict': {
2583a030 JMF	27	'id': '52dd3e4b02a7602131000677',
2583a030 JMF	28	'ext': 'mp4',
d16076ff PH	29	'title': 'Legalese It! with @MikeSacksHP',
d16076ff PH	30	'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
db1f3888	31	'duration': 1549,
d16076ff	32	'upload_date': '20140124',
f0ec61b5 YCH	33	},
	34	'params': {
	35	# m3u8 download
	36	'skip_download': True,
	37	},
	38	'expected_warnings': ['HTTP Error 404: Not Found'],
db1f3888 PH	39	}
	40
	41	def _real_extract(self, url):
4c1ce987	42	video_id = self._match_id(url)
db1f3888 PH	43
	44	api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
	45	data = self._download_json(api_url, video_id)['data']
	46
	47	video_title = data['title']
398133cf S	48	duration = parse_duration(data.get('running_time'))
	49	upload_date = unified_strdate(
	50	data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
d16076ff	51	description = data.get('description')
db1f3888 PH	52
db1f3888 PH	53	thumbnails = []
f0ec61b5	54	for url in filter(None, data['images'].values()):
ec85ded8	55	m = re.match(r'.*-([0-9]+x[0-9]+)\.', url)
db1f3888 PH	56	if not m:
	57	continue
	58	thumbnails.append({
	59	'url': url,
	60	'resolution': m.group(1),
	61	})
	62
f0ec61b5 YCH	63	formats = []
	64	sources = data.get('sources', {})
	65	live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
	66	for key, url in live_sources:
	67	ext = determine_ext(url)
	68	if ext == 'm3u8':
	69	formats.extend(self._extract_m3u8_formats(
	70	url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
	71	elif ext == 'f4m':
9154c87f	72	formats.extend(self._extract_f4m_formats(
f0ec61b5 YCH	73	url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
	74	else:
	75	formats.append({
	76	'format': key,
	77	'format_id': key.replace('/', '.'),
	78	'ext': 'mp4',
	79	'url': url,
	80	'vcodec': 'none' if key.startswith('audio/') else None,
	81	})
398133cf S	82
	83	if not formats and data.get('fivemin_id'):
	84	return self.url_result('5min:%s' % data['fivemin_id'])
	85
db1f3888 PH	86	self._sort_formats(formats)
	87
	88	return {
	89	'id': video_id,
	90	'title': video_title,
d16076ff	91	'description': description,
db1f3888 PH	92	'formats': formats,
	93	'duration': duration,
	94	'upload_date': upload_date,
	95	'thumbnails': thumbnails,
	96	}