[yt-dlp.git] / yt_dlp / extractor / huffpost.py

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    parse_duration,
    unified_strdate,
)


class HuffPostIE(InfoExtractor):
    """Extractor for Huffington Post Live segments (live.huffingtonpost.com)."""

    IE_DESC = 'Huffington Post'
    # Matches both the regular segment pages and the embed player URLs;
    # the hexadecimal segment id is captured as `id`.
    _VALID_URL = r'''(?x)
        https?://(embed\.)?live\.huffingtonpost\.com/
        (?:
            r/segment/[^/]+/|
            HPLEmbedPlayer/\?segmentId=
        )
        (?P<id>[0-9a-f]+)'''

    _TEST = {
        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
        'md5': '55f5e8981c1c80a64706a44b74833de8',
        'info_dict': {
            'id': '52dd3e4b02a7602131000677',
            'ext': 'mp4',
            'title': 'Legalese It! with @MikeSacksHP',
            'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more.  ',
            'duration': 1549,
            'upload_date': '20140124',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['HTTP Error 404: Not Found'],
    }

    def _real_extract(self, url):
        """Build the info dict for a segment from the segments JSON API.

        The segment id is taken from ``url`` and used to query
        ``/api/segments/<id>.json``. Only ``data`` and ``data['title']`` are
        required in the response; every other field is read defensively so
        that a missing or null entry results in absent metadata instead of
        an exception.
        """
        video_id = self._match_id(url)

        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
        data = self._download_json(api_url, video_id)['data']

        video_title = data['title']
        duration = parse_duration(data.get('running_time'))
        # `or {}` instead of a .get() default: a default only covers a
        # missing key, not a key that is present with a null value.
        schedule = data.get('schedule') or {}
        upload_date = unified_strdate(
            schedule.get('starts_at') or data.get('segment_start_date_time'))
        description = data.get('description')

        # Thumbnails are optional: tolerate a missing/null `images` mapping
        # and keep only URLs that carry a "-<width>x<height>." marker.
        thumbnails = []
        for thumbnail_url in filter(None, (data.get('images') or {}).values()):
            m = re.match(r'.*-([0-9]+x[0-9]+)\.', thumbnail_url)
            if not m:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'resolution': m.group(1),
            })

        formats = []
        sources = data.get('sources') or {}
        live_sources = (
            list((sources.get('live') or {}).items())
            + list((sources.get('live_again') or {}).items()))
        # Distinct loop variable names so the `url` argument is not clobbered.
        for key, source_url in live_sources:
            if not source_url:
                # An entry without a URL cannot yield a usable format.
                continue
            ext = determine_ext(source_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    source_url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
            else:
                # Direct media URL; the source key doubles as the format name.
                formats.append({
                    'format': key,
                    'format_id': key.replace('/', '.'),
                    'ext': 'mp4',
                    'url': source_url,
                    'vcodec': 'none' if key.startswith('audio/') else None,
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            'description': description,
            'formats': formats,
            'duration': duration,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
        }
Commit	Line	Data
db1f3888 PH	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
f0ec61b5	5	determine_ext,
db1f3888 PH	6	parse_duration,
	7	unified_strdate,
	8	)
	9
	10
	11	class HuffPostIE(InfoExtractor):
	12	IE_DESC = 'Huffington Post'
	13	_VALID_URL = r'''(?x)
	14	https?://(embed\.)?live\.huffingtonpost\.com/
	15	(?:
	16	r/segment/[^/]+/\|
	17	HPLEmbedPlayer/\?segmentId=
	18	)
	19	(?P<id>[0-9a-f]+)'''
	20
	21	_TEST = {
	22	'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
d16076ff	23	'md5': '55f5e8981c1c80a64706a44b74833de8',
db1f3888	24	'info_dict': {
2583a030 JMF	25	'id': '52dd3e4b02a7602131000677',
2583a030 JMF	26	'ext': 'mp4',
d16076ff PH	27	'title': 'Legalese It! with @MikeSacksHP',
d16076ff PH	28	'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
db1f3888	29	'duration': 1549,
d16076ff	30	'upload_date': '20140124',
f0ec61b5 YCH	31	},
	32	'params': {
	33	# m3u8 download
	34	'skip_download': True,
	35	},
	36	'expected_warnings': ['HTTP Error 404: Not Found'],
db1f3888 PH	37	}
	38
	39	def _real_extract(self, url):
4c1ce987	40	video_id = self._match_id(url)
db1f3888 PH	41
	42	api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
	43	data = self._download_json(api_url, video_id)['data']
	44
	45	video_title = data['title']
398133cf S	46	duration = parse_duration(data.get('running_time'))
	47	upload_date = unified_strdate(
	48	data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
d16076ff	49	description = data.get('description')
db1f3888 PH	50
db1f3888 PH	51	thumbnails = []
f0ec61b5	52	for url in filter(None, data['images'].values()):
ec85ded8	53	m = re.match(r'.*-([0-9]+x[0-9]+)\.', url)
db1f3888 PH	54	if not m:
	55	continue
	56	thumbnails.append({
	57	'url': url,
	58	'resolution': m.group(1),
	59	})
	60
f0ec61b5 YCH	61	formats = []
	62	sources = data.get('sources', {})
	63	live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
	64	for key, url in live_sources:
	65	ext = determine_ext(url)
	66	if ext == 'm3u8':
	67	formats.extend(self._extract_m3u8_formats(
	68	url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
	69	elif ext == 'f4m':
9154c87f	70	formats.extend(self._extract_f4m_formats(
f0ec61b5 YCH	71	url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
	72	else:
	73	formats.append({
	74	'format': key,
	75	'format_id': key.replace('/', '.'),
	76	'ext': 'mp4',
	77	'url': url,
	78	'vcodec': 'none' if key.startswith('audio/') else None,
	79	})
398133cf	80
db1f3888 PH	81	self._sort_formats(formats)
	82
	83	return {
	84	'id': video_id,
	85	'title': video_title,
d16076ff	86	'description': description,
db1f3888 PH	87	'formats': formats,
	88	'duration': duration,
	89	'upload_date': upload_date,
	90	'thumbnails': thumbnails,
	91	}