[yt-dlp.git] / yt_dlp / extractor / huffpost.py

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    parse_duration,
    unified_strdate,
)


class HuffPostIE(InfoExtractor):
    """Extractor for HuffPost Live segments.

    Matches segment pages (``/r/segment/<slug>/<id>``) and embedded players
    (``/HPLEmbedPlayer/?segmentId=<id>``) on ``live.huffingtonpost.com`` and
    ``embed.live.huffingtonpost.com``.
    """

    IE_DESC = 'Huffington Post'
    _VALID_URL = r'''(?x)
        https?://(embed\.)?live\.huffingtonpost\.com/
        (?:
            r/segment/[^/]+/|
            HPLEmbedPlayer/\?segmentId=
        )
        (?P<id>[0-9a-f]+)'''
    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1']

    _TEST = {
        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
        'md5': '55f5e8981c1c80a64706a44b74833de8',
        'info_dict': {
            'id': '52dd3e4b02a7602131000677',
            'ext': 'mp4',
            'title': 'Legalese It! with @MikeSacksHP',
            'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more.  ',
            'duration': 1549,
            'upload_date': '20140124',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['HTTP Error 404: Not Found'],
    }

    def _real_extract(self, url):
        """Build the info dict for a single segment.

        The segment id is taken from ``url`` and used to query the segment
        JSON API; everything else is read from the ``data`` object of that
        response.

        Returns a dict with ``id``, ``title``, ``description``, ``formats``,
        ``duration``, ``upload_date`` and ``thumbnails``.

        Raises ``KeyError`` if the response lacks ``data`` or ``title``; all
        other fields are treated as optional.
        """
        video_id = self._match_id(url)

        api_url = f'http://embed.live.huffingtonpost.com/api/segments/{video_id}.json'
        data = self._download_json(api_url, video_id)['data']

        # Only the title is mandatory. Optional containers use ``or {}`` so
        # that both a missing key and an explicit JSON null are tolerated.
        video_title = data['title']
        duration = parse_duration(data.get('running_time'))
        upload_date = unified_strdate(
            (data.get('schedule') or {}).get('starts_at') or data.get('segment_start_date_time'))
        description = data.get('description')

        thumbnails = []
        # Distinct loop variable: do not clobber the ``url`` parameter.
        for thumbnail_url in filter(None, (data.get('images') or {}).values()):
            # Keep only images whose file name carries a WIDTHxHEIGHT suffix.
            m = re.match(r'.*-([0-9]+x[0-9]+)\.', thumbnail_url)
            if not m:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'resolution': m.group(1),
            })

        formats = []
        sources = data.get('sources') or {}
        # 'live' entries are listed before 'live_again' entries.
        live_sources = [
            *(sources.get('live') or {}).items(),
            *(sources.get('live_again') or {}).items(),
        ]
        for key, source_url in live_sources:
            if not source_url:
                # A source without a URL cannot be turned into a usable format.
                continue
            ext = determine_ext(source_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    source_url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
            else:
                # Direct media link; the source key is used as the format
                # label and, with '/' replaced by '.', as the format id.
                formats.append({
                    'format': key,
                    'format_id': key.replace('/', '.'),
                    'ext': 'mp4',
                    'url': source_url,
                    'vcodec': 'none' if key.startswith('audio/') else None,
                })

        return {
            'id': video_id,
            'title': video_title,
            'description': description,
            'formats': formats,
            'duration': duration,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
        }
Commit	Line	Data
db1f3888 PH	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
f0ec61b5	5	determine_ext,
db1f3888 PH	6	parse_duration,
	7	unified_strdate,
	8	)
	9
	10
	11	class HuffPostIE(InfoExtractor):
	12	IE_DESC = 'Huffington Post'
	13	_VALID_URL = r'''(?x)
	14	https?://(embed\.)?live\.huffingtonpost\.com/
	15	(?:
	16	r/segment/[^/]+/\|
	17	HPLEmbedPlayer/\?segmentId=
	18	)
	19	(?P<id>[0-9a-f]+)'''
bfd973ec	20	_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1']
db1f3888 PH	21
	22	_TEST = {
	23	'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
d16076ff	24	'md5': '55f5e8981c1c80a64706a44b74833de8',
db1f3888	25	'info_dict': {
2583a030 JMF	26	'id': '52dd3e4b02a7602131000677',
2583a030 JMF	27	'ext': 'mp4',
d16076ff PH	28	'title': 'Legalese It! with @MikeSacksHP',
d16076ff PH	29	'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
db1f3888	30	'duration': 1549,
d16076ff	31	'upload_date': '20140124',
f0ec61b5 YCH	32	},
	33	'params': {
	34	# m3u8 download
	35	'skip_download': True,
	36	},
	37	'expected_warnings': ['HTTP Error 404: Not Found'],
db1f3888 PH	38	}
	39
	40	def _real_extract(self, url):
4c1ce987	41	video_id = self._match_id(url)
db1f3888	42
add96eb9	43	api_url = f'http://embed.live.huffingtonpost.com/api/segments/{video_id}.json'
db1f3888 PH	44	data = self._download_json(api_url, video_id)['data']
	45
	46	video_title = data['title']
398133cf S	47	duration = parse_duration(data.get('running_time'))
	48	upload_date = unified_strdate(
	49	data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
d16076ff	50	description = data.get('description')
db1f3888 PH	51
db1f3888 PH	52	thumbnails = []
f0ec61b5	53	for url in filter(None, data['images'].values()):
ec85ded8	54	m = re.match(r'.*-([0-9]+x[0-9]+)\.', url)
db1f3888 PH	55	if not m:
	56	continue
	57	thumbnails.append({
	58	'url': url,
	59	'resolution': m.group(1),
	60	})
	61
f0ec61b5 YCH	62	formats = []
	63	sources = data.get('sources', {})
	64	live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
	65	for key, url in live_sources:
	66	ext = determine_ext(url)
	67	if ext == 'm3u8':
	68	formats.extend(self._extract_m3u8_formats(
	69	url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
	70	elif ext == 'f4m':
9154c87f	71	formats.extend(self._extract_f4m_formats(
f0ec61b5 YCH	72	url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
	73	else:
	74	formats.append({
	75	'format': key,
	76	'format_id': key.replace('/', '.'),
	77	'ext': 'mp4',
	78	'url': url,
	79	'vcodec': 'none' if key.startswith('audio/') else None,
	80	})
398133cf	81
db1f3888 PH	82	return {
	83	'id': video_id,
	84	'title': video_title,
d16076ff	85	'description': description,
db1f3888 PH	86	'formats': formats,
	87	'duration': duration,
	88	'upload_date': upload_date,
	89	'thumbnails': thumbnails,
	90	}