[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import compat_str
from ..utils import int_or_none


class TEDIE(InfoExtractor):
    '''Extractor for ted.com URLs.

    _real_extract dispatches on the URL shape matched by _VALID_URL:
    talks, playlists and "watch" pages each have their own handler, and
    embed hosts are rewritten to the www host first.
    '''
    IE_NAME = 'ted'
    # Verbose-mode regex. The named groups are read by _real_extract:
    #   proto         - URL scheme including '://'
    #   type          - host prefix ('www', 'embed' or 'embed-ssl')
    #   urlmain       - everything from '.ted.com/' to the end of the URL
    #   type_playlist / type_talk / type_watch - which kind of page matched
    #   name          - slug that follows the page kind (and optional /lang/ part)
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    # Sample URLs with the metadata expected for each of them.
    # NOTE(review): presumably consumed by the project's download test
    # harness; the harness itself is not part of this file.
    _TESTS = [{
        # Talk URL with the ".html" suffix
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 853,
            'duration': 1308,
        }
    }, {
        # "watch" URL; the expected id is a YouTube video id
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': 'b899ac15e345fb39534d913f7606082b',
        'info_dict': {
            'id': 'tSVI8ta_P4w',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
            'upload_date': '20140122',
            'uploader_id': 'TEDInstitute',
            'uploader': 'TED Institute',
        },
        'add_ie': ['Youtube'],
    }, {
        # Talk URL without the ".html" suffix
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        # Playlist URL
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'webm',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Pixel dimensions merged by _talk_info into every format built from the
    # talk's 'nativeDownloads' mapping whose key is one of these names.
    _NATIVE_FORMATS = {
        'low': {'width': 320, 'height': 180},
        'medium': {'width': 512, 'height': 288},
        'high': {'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object passed to the page's init call.

        The page is expected to contain a script of the form
        q("<name>.init",{...})</script>; the {...} argument is parsed
        with json.loads and returned.

        A page without such a call is reported by _search_regex (which is
        called without a default); malformed JSON makes json.loads raise
        ValueError.
        '''
        # The dot before "init" is escaped so that it only matches a literal
        # '.', not an arbitrary character.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Return a playlist result with one URL entry per talk on the page.

        The playlist id and title come from the 'playlist' object of the
        page data; each entry points at the talk page built from the
        talk's 'slug' and is tagged with this extractor's key.
        '''
        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        data = self._extract_info(page)
        meta = data['playlist']

        entries = []
        for talk in data['talks']:
            talk_url = 'http://www.ted.com/talks/' + talk['slug']
            entries.append(self.url_result(talk_url, self.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=compat_str(meta['id']),
            playlist_title=meta['title'])

    def _talk_info(self, url, video_name):
        '''Extract a single talk from its page.

        Downloads the page, takes the first entry of the 'talks' list in
        the page data and then either:

        * returns a 'url' result when the talk has an 'external' host
          (the 'code' value for YouTube, the 'uri' value otherwise), or
        * returns an info dict whose formats are collected from the
          'nativeDownloads', 'resources' (h264 / rtmp / hls) and
          'audioDownload' entries of the talk data.

        'id' and 'title' are required in the talk data; the thumbnail,
        uploader and duration are optional and are None when missing.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        # Direct downloads: quality name -> URL. The mapping may be absent or
        # null, and individual URLs may be null.
        formats = []
        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            native_format = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Known quality names get their fixed dimensions attached.
            native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(native_format)

        # Last h264 URL that embeds a "<digits>k" bitrate marker; used below as
        # the template for HTTP variants of the HLS formats.
        http_url = None
        for format_id, resources in (talk_info.get('resources') or {}).items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
                    if re.search(r'\d+k', h264_url):
                        http_url = h264_url
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                # Do not hand a missing manifest URL to the m3u8 helper.
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False))

        # Video-carrying HLS variants (audio-only and 'multiple' entries are
        # left out). Collected before the loop below appends to formats.
        m3u8_formats = [
            fmt for fmt in formats
            if fmt.get('protocol') == 'm3u8'
            and fmt.get('vcodec') != 'none'
            and fmt.get('resolution') != 'multiple']
        if http_url:
            for m3u8_format in m3u8_formats:
                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                if not bitrate:
                    continue
                # Clone the HLS variant, pointing it at the h264 URL with the
                # variant's own bitrate marker substituted in.
                http_format = m3u8_format.copy()
                http_format.update({
                    'url': re.sub(r'\d+k', bitrate, http_url),
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
                formats.append(http_format)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        # The thumbnail is optional metadata: a talk without one must still
        # extract. Values without a scheme get 'http://' prepended.
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                sub_lang_list[l] = [
                    {
                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
                        'ext': ext,
                    }
                    for ext in ['ted', 'srt']
                ]
            return sub_lang_list
        else:
            return {}

    def _watch_info(self, url, name):
        '''Extract a "watch" page.

        When the page carries an inline "pages.jwplayer" config, the video
        URL and thumbnail are read from it and the title and description
        are scraped from the markup. Otherwise the source of the embedded
        video iframe is returned as a URL result.
        '''
        page = self._download_webpage(url, name)

        raw_config = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            page, 'config', default=None)
        if not raw_config:
            # No player config on the page: hand over the iframe target.
            iframe_src = self._search_regex(
                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", page, 'embed url')
            return self.url_result(self._proto_relative_url(iframe_src))

        player = json.loads(raw_config)['config']
        info = {
            'id': name,
            'url': player['video']['url'],
            'thumbnail': player.get('image', {}).get('url'),
        }
        info['title'] = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", page, 'title')
        # Two markup variants are tried; a missing description is not fatal.
        info['description'] = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            page, 'description', fatal=False)
        return info
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a504ced0	6	from .common import InfoExtractor
9fd5ce0c	7
66ee7b32 S	8	from ..compat import compat_str
66ee7b32 S	9	from ..utils import int_or_none
4ed3e510	10
f853f859	11
a504ced0	12	class TEDIE(InfoExtractor):
cfbee8a4	13	IE_NAME = 'ted'
aab74fa1 PH	14	_VALID_URL = r'''(?x)
aab74fa1 PH	15	(?P<proto>https?://)
cd791a5e	16	(?P<type>www\|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	\|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	\|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
f628d800	30	'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173	35	'description': ('Philosopher Dan Dennett makes a compelling '
9e1a5b84 JW	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
f628d800	40	'width': 853,
eb4cb42a	41	'duration': 1308,
6f5ac90c	42	}
ac6c1048 PH	43	}, {
ac6c1048 PH	44	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
f628d800	45	'md5': 'b899ac15e345fb39534d913f7606082b',
ac6c1048	46	'info_dict': {
f628d800	47	'id': 'tSVI8ta_P4w',
ac6c1048 PH	48	'ext': 'mp4',
ac6c1048 PH	49	'title': 'Vishal Sikka: The beauty and power of algorithms',
ec85ded8	50	'thumbnail': r're:^https?://.+\.jpg',
f628d800	51	'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
	52	'upload_date': '20140122',
	53	'uploader_id': 'TEDInstitute',
	54	'uploader': 'TED Institute',
	55	},
	56	'add_ie': ['Youtube'],
2d4c98db JMF	57	}, {
2d4c98db JMF	58	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
f628d800	59	'md5': '71b3ab2f4233012dce09d515c9c39ce2',
2d4c98db JMF	60	'info_dict': {
2d4c98db JMF	61	'id': '1972',
5bec5748	62	'ext': 'mp4',
2d4c98db JMF	63	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	64	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	65	'description': 'md5:5174aed4d0f16021b704120360f72b92',
eb4cb42a	66	'duration': 1128,
2d4c98db	67	},
22a6f150 PH	68	}, {
	69	'url': 'http://www.ted.com/playlists/who_are_the_hackers',
	70	'info_dict': {
	71	'id': '10',
	72	'title': 'Who are the hackers?',
	73	},
	74	'playlist_mincount': 6,
a72cbfac JMF	75	}, {
	76	# contains a youtube video
	77	'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
	78	'add_ie': ['Youtube'],
	79	'info_dict': {
	80	'id': '_ZG8HBuDjgc',
f22ba4bd	81	'ext': 'webm',
a72cbfac JMF	82	'title': 'Douglas Adams: Parrots the Universe and Everything',
	83	'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
	84	'uploader': 'University of California Television (UCTV)',
	85	'uploader_id': 'UCtelevision',
	86	'upload_date': '20080522',
	87	},
	88	'params': {
	89	'skip_download': True,
	90	},
a461a119 S	91	}, {
	92	# YouTube video
	93	'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
	94	'add_ie': ['Youtube'],
	95	'info_dict': {
	96	'id': 'aFBIPO-P7LM',
	97	'ext': 'mp4',
	98	'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
	99	'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
	100	'uploader': 'TEDx Talks',
	101	'uploader_id': 'TEDxTalks',
	102	'upload_date': '20111216',
	103	},
	104	'params': {
	105	'skip_download': True,
	106	},
ac6c1048	107	}]
9fd5ce0c	108
0ba77818	109	_NATIVE_FORMATS = {
11fa3d7f	110	'low': {'width': 320, 'height': 180},
	111	'medium': {'width': 512, 'height': 288},
	112	'high': {'width': 854, 'height': 480},
652bee05	113	}
9fd5ce0c	114
ca1fee34	115	def _extract_info(self, webpage):
bacac173	116	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
9e1a5b84	117	webpage, 'info json')
ca1fee34 JMF	118	return json.loads(info_json)
ca1fee34 JMF	119
9fd5ce0c	120	def _real_extract(self, url):
bacac173	121	m = re.match(self._VALID_URL, url, re.VERBOSE)
cd791a5e	122	if m.group('type').startswith('embed'):
aab74fa1 PH	123	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
aab74fa1 PH	124	return self.url_result(desktop_url, 'TED')
bacac173	125	name = m.group('name')
9fd5ce0c	126	if m.group('type_talk'):
bacac173	127	return self._talk_info(url, name)
ac6c1048 PH	128	elif m.group('type_watch'):
ac6c1048 PH	129	return self._watch_info(url, name)
bacac173	130	else:
ca1fee34	131	return self._playlist_videos_info(url, name)
9fd5ce0c	132
ca1fee34	133	def _playlist_videos_info(self, url, name):
9fd5ce0c	134	'''Returns the videos of the playlist'''
fc2ef392	135
ca1fee34	136	webpage = self._download_webpage(url, name,
9e1a5b84	137	'Downloading playlist webpage')
ca1fee34 JMF	138	info = self._extract_info(webpage)
ca1fee34 JMF	139	playlist_info = info['playlist']
9fd5ce0c	140
fc2ef392	141	playlist_entries = [
f07a9f6f	142	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	143	for talk in info['talks']
fc2ef392 PH	144	]
fc2ef392 PH	145	return self.playlist_result(
ca1fee34 JMF	146	playlist_entries,
	147	playlist_id=compat_str(playlist_info['id']),
	148	playlist_title=playlist_info['title'])
9fd5ce0c	149
bacac173 JMF	150	def _talk_info(self, url, video_name):
bacac173 JMF	151	webpage = self._download_webpage(url, video_name)
9fd5ce0c	152	self.report_extraction(video_name)
a9a3876d	153
ca1fee34	154	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	155
a461a119 S	156	external = talk_info.get('external')
	157	if external:
	158	service = external['service']
	159	self.to_screen('Found video from %s' % service)
	160	ext_url = None
	161	if service.lower() == 'youtube':
	162	ext_url = external.get('code')
a72cbfac JMF	163	return {
a72cbfac JMF	164	'_type': 'url',
a461a119	165	'url': ext_url or external['uri'],
a72cbfac JMF	166	}
a72cbfac JMF	167
652bee05	168	formats = [{
652bee05 JMF	169	'url': format_url,
	170	'format_id': format_id,
	171	'format': format_id,
2d4c98db JMF	172	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	173	if formats:
	174	for f in formats:
	175	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	176	if finfo:
	177	f.update(finfo)
66ee7b32	178
11fa3d7f	179	http_url = None
66ee7b32 S	180	for format_id, resources in talk_info['resources'].items():
	181	if format_id == 'h264':
	182	for resource in resources:
11fa3d7f	183	h264_url = resource.get('file')
	184	if not h264_url:
	185	continue
66ee7b32 S	186	bitrate = int_or_none(resource.get('bitrate'))
66ee7b32 S	187	formats.append({
11fa3d7f	188	'url': h264_url,
66ee7b32 S	189	'format_id': '%s-%sk' % (format_id, bitrate),
	190	'tbr': bitrate,
	191	})
ec85ded8	192	if re.search(r'\d+k', h264_url):
11fa3d7f	193	http_url = h264_url
66ee7b32 S	194	elif format_id == 'rtmp':
	195	streamer = talk_info.get('streamer')
	196	if not streamer:
	197	continue
	198	for resource in resources:
	199	formats.append({
	200	'format_id': '%s-%s' % (format_id, resource.get('name')),
	201	'url': streamer,
	202	'play_path': resource['file'],
	203	'ext': 'flv',
	204	'width': int_or_none(resource.get('width')),
	205	'height': int_or_none(resource.get('height')),
	206	'tbr': int_or_none(resource.get('bitrate')),
	207	})
	208	elif format_id == 'hls':
11fa3d7f	209	formats.extend(self._extract_m3u8_formats(
	210	resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
	211
	212	m3u8_formats = list(filter(
	213	lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
	214	formats))
	215	if http_url:
	216	for m3u8_format in m3u8_formats:
	217	bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
	218	if not bitrate:
	219	continue
	220	f = m3u8_format.copy()
	221	f.update({
	222	'url': re.sub(r'\d+k', bitrate, http_url),
	223	'format_id': m3u8_format['format_id'].replace('hls', 'http'),
	224	'protocol': 'http',
	225	})
	226	formats.append(f)
66ee7b32 S	227
	228	audio_download = talk_info.get('audioDownload')
	229	if audio_download:
	230	formats.append({
	231	'url': audio_download,
	232	'format_id': 'audio',
736785ab	233	'vcodec': 'none',
66ee7b32 S	234	})
66ee7b32 S	235
f628d800	236	self._sort_formats(formats)
652bee05	237
7b9965ea	238	video_id = compat_str(talk_info['id'])
a9a3876d	239
b6c1cecc JMF	240	thumbnail = talk_info['thumb']
	241	if not thumbnail.startswith('http'):
	242	thumbnail = 'http://' + thumbnail
463a9087	243	return {
a9a3876d	244	'id': video_id,
a8eb5a8e	245	'title': talk_info['title'].strip(),
652bee05	246	'uploader': talk_info['speaker'],
b6c1cecc	247	'thumbnail': thumbnail,
652bee05	248	'description': self._og_search_description(webpage),
03091e37	249	'subtitles': self._get_subtitles(video_id, talk_info),
0d8cb1cc	250	'formats': formats,
eb4cb42a	251	'duration': talk_info.get('duration'),
0d8cb1cc PH	252	}
0d8cb1cc PH	253
a504ced0	254	def _get_subtitles(self, video_id, talk_info):
652bee05 JMF	255	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	256	if languages:
	257	sub_lang_list = {}
	258	for l in languages:
a504ced0 JMF	259	sub_lang_list[l] = [
	260	{
	261	'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
	262	'ext': ext,
	263	}
	264	for ext in ['ted', 'srt']
	265	]
652bee05 JMF	266	return sub_lang_list
652bee05 JMF	267	else:
652bee05	268	return {}
ac6c1048 PH	269
	270	def _watch_info(self, url, name):
	271	webpage = self._download_webpage(url, name)
	272
	273	config_json = self._html_search_regex(
de9bd74b	274	r'"pages\.jwplayer"\s,\s({.+?})\s\)\s</script>',
f628d800	275	webpage, 'config', default=None)
	276	if not config_json:
	277	embed_url = self._search_regex(
	278	r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
	279	return self.url_result(self._proto_relative_url(embed_url))
de9bd74b	280	config = json.loads(config_json)['config']
ac6c1048 PH	281	video_url = config['video']['url']
	282	thumbnail = config.get('image', {}).get('url')
	283
	284	title = self._html_search_regex(
	285	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	286	description = self._html_search_regex(
621f33c9 PH	287	[
	288	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.?</h4>(.?)</div>',
	289	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	290	],
ac6c1048 PH	291	webpage, 'description', fatal=False)
	292
	293	return {
	294	'id': name,
	295	'url': video_url,
	296	'title': title,
	297	'thumbnail': thumbnail,
	298	'description': description,
	299	}