[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import compat_str
from ..utils import int_or_none


class TEDIE(InfoExtractor):
    """Extractor for ted.com talk pages, playlists and "watch" pages.

    Embed URLs (``embed`` / ``embed-ssl`` hosts) are redirected to the
    equivalent ``www`` URL and handled by this same extractor.
    """

    IE_NAME = 'ted'
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra metadata merged into the direct-download formats, keyed by the
    # names used in the talk's "nativeDownloads" mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON argument of the page's q("<name>.init", ...) call.

        webpage is the HTML of a talk or playlist page.
        """
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch url to the talk, watch or playlist handler."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj.group('type').startswith('embed'):
            # Embed hosts are only an alias: rebuild the same path on the
            # "www" host and let this extractor process that URL instead.
            desktop_url = mobj.group('proto') + 'www' + mobj.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = mobj.group('name')
        if mobj.group('type_talk'):
            return self._talk_info(url, name)
        if mobj.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result with one URL entry per talk of the playlist."""
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each entry points back at the talk page, handled by this extractor.
        entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _extract_talk_formats(self, talk_info, video_name):
        """Build the (unsorted) list of format dicts for a talk.

        Formats come from the direct downloads ("nativeDownloads"), the
        "resources" mapping (h264, rtmp and hls entries) and the optional
        "audioDownload" URL.  Missing or null sections are skipped.
        """
        formats = []

        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Attach the known preference/dimensions for this quality name.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        for format_id, resources in (talk_info.get('resources') or {}).items():
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                # RTMP entries only carry a play path; without the shared
                # streamer URL they cannot be used.
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                hls_formats = self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id)
                for fmt in hls_formats:
                    if fmt.get('format_id') == 'hls-meta':
                        continue
                    # Variants without a height are tagged as having no
                    # video; variants with one are tagged as having no audio.
                    if not fmt.get('height'):
                        fmt['vcodec'] = 'none'
                    else:
                        fmt['acodec'] = 'none'
                formats.extend(hls_formats)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
                'preference': -0.5,
            })

        return formats

    def _talk_info(self, url, video_name):
        """Extract a single talk page.

        Returns a plain URL result when the talk is hosted externally
        (for YouTube the "code" value is used as the URL when present,
        otherwise the "uri" value), or a full info dict with formats,
        subtitles and metadata otherwise.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = self._extract_talk_formats(talk_info, video_name)
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info.get('thumb')
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        """Return a dict mapping language code to subtitle descriptors.

        Every language listed in talk_info['languages'] gets two entries,
        one per supported format ('ted' and 'srt').  An empty dict is
        returned when no languages are listed.
        """
        subtitles = {}
        for language in talk_info.get('languages') or []:
            lang_code = language['languageCode']
            subtitles[lang_code] = [
                {
                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                    'ext': ext,
                }
                for ext in ['ted', 'srt']
            ]
        return subtitles

    def _watch_info(self, url, name):
        """Extract a "watch" page from its embedded jwplayer configuration."""
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # "image" may be absent or null; the thumbnail is optional.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a504ced0	6	from .common import InfoExtractor
9fd5ce0c	7
66ee7b32 S	8	from ..compat import compat_str
66ee7b32 S	9	from ..utils import int_or_none
4ed3e510	10
f853f859	11
a504ced0	12	class TEDIE(InfoExtractor):
cfbee8a4	13	IE_NAME = 'ted'
aab74fa1 PH	14	_VALID_URL = r'''(?x)
aab74fa1 PH	15	(?P<proto>https?://)
cd791a5e	16	(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
4d2f143c	30	'md5': 'fc94ac279feebbce69f21c0c6ee82810',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173	35	'description': ('Philosopher Dan Dennett makes a compelling '
9e1a5b84 JW	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
eb4cb42a	41	'duration': 1308,
6f5ac90c	42	}
ac6c1048 PH	43	}, {
	44	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	45	'md5': '226f4fb9c62380d11b7995efa4c87994',
	46	'info_dict': {
	47	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	48	'ext': 'mp4',
	49	'title': 'Vishal Sikka: The beauty and power of algorithms',
	50	'thumbnail': 're:^https?://.+\.jpg',
	51	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	52	}
2d4c98db JMF	53	}, {
	54	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
	55	'info_dict': {
	56	'id': '1972',
5bec5748	57	'ext': 'mp4',
2d4c98db JMF	58	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	59	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	60	'description': 'md5:5174aed4d0f16021b704120360f72b92',
eb4cb42a	61	'duration': 1128,
2d4c98db	62	},
22a6f150 PH	63	}, {
	64	'url': 'http://www.ted.com/playlists/who_are_the_hackers',
	65	'info_dict': {
	66	'id': '10',
	67	'title': 'Who are the hackers?',
	68	},
	69	'playlist_mincount': 6,
a72cbfac JMF	70	}, {
	71	# contains a youtube video
	72	'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
	73	'add_ie': ['Youtube'],
	74	'info_dict': {
	75	'id': '_ZG8HBuDjgc',
	76	'ext': 'mp4',
	77	'title': 'Douglas Adams: Parrots the Universe and Everything',
	78	'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
	79	'uploader': 'University of California Television (UCTV)',
	80	'uploader_id': 'UCtelevision',
	81	'upload_date': '20080522',
	82	},
	83	'params': {
	84	'skip_download': True,
	85	},
a461a119 S	86	}, {
	87	# YouTube video
	88	'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
	89	'add_ie': ['Youtube'],
	90	'info_dict': {
	91	'id': 'aFBIPO-P7LM',
	92	'ext': 'mp4',
	93	'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
	94	'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
	95	'uploader': 'TEDx Talks',
	96	'uploader_id': 'TEDxTalks',
	97	'upload_date': '20111216',
	98	},
	99	'params': {
	100	'skip_download': True,
	101	},
ac6c1048	102	}]
9fd5ce0c	103
0ba77818 PH	104	_NATIVE_FORMATS = {
	105	'low': {'preference': 1, 'width': 320, 'height': 180},
	106	'medium': {'preference': 2, 'width': 512, 'height': 288},
	107	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	108	}
9fd5ce0c	109
ca1fee34	110	def _extract_info(self, webpage):
bacac173	111	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
9e1a5b84	112	webpage, 'info json')
ca1fee34 JMF	113	return json.loads(info_json)
ca1fee34 JMF	114
9fd5ce0c	115	def _real_extract(self, url):
bacac173	116	m = re.match(self._VALID_URL, url, re.VERBOSE)
cd791a5e	117	if m.group('type').startswith('embed'):
aab74fa1 PH	118	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
aab74fa1 PH	119	return self.url_result(desktop_url, 'TED')
bacac173	120	name = m.group('name')
9fd5ce0c	121	if m.group('type_talk'):
bacac173	122	return self._talk_info(url, name)
ac6c1048 PH	123	elif m.group('type_watch'):
ac6c1048 PH	124	return self._watch_info(url, name)
bacac173	125	else:
ca1fee34	126	return self._playlist_videos_info(url, name)
9fd5ce0c	127
ca1fee34	128	def _playlist_videos_info(self, url, name):
9fd5ce0c	129	'''Returns the videos of the playlist'''
fc2ef392	130
ca1fee34	131	webpage = self._download_webpage(url, name,
9e1a5b84	132	'Downloading playlist webpage')
ca1fee34 JMF	133	info = self._extract_info(webpage)
ca1fee34 JMF	134	playlist_info = info['playlist']
9fd5ce0c	135
fc2ef392	136	playlist_entries = [
f07a9f6f	137	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	138	for talk in info['talks']
fc2ef392 PH	139	]
fc2ef392 PH	140	return self.playlist_result(
ca1fee34 JMF	141	playlist_entries,
	142	playlist_id=compat_str(playlist_info['id']),
	143	playlist_title=playlist_info['title'])
9fd5ce0c	144
bacac173 JMF	145	def _talk_info(self, url, video_name):
bacac173 JMF	146	webpage = self._download_webpage(url, video_name)
9fd5ce0c	147	self.report_extraction(video_name)
a9a3876d	148
ca1fee34	149	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	150
a461a119 S	151	external = talk_info.get('external')
	152	if external:
	153	service = external['service']
	154	self.to_screen('Found video from %s' % service)
	155	ext_url = None
	156	if service.lower() == 'youtube':
	157	ext_url = external.get('code')
a72cbfac JMF	158	return {
a72cbfac JMF	159	'_type': 'url',
a461a119	160	'url': ext_url or external['uri'],
a72cbfac JMF	161	}
a72cbfac JMF	162
652bee05	163	formats = [{
652bee05 JMF	164	'url': format_url,
	165	'format_id': format_id,
	166	'format': format_id,
2d4c98db JMF	167	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	168	if formats:
	169	for f in formats:
	170	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	171	if finfo:
	172	f.update(finfo)
66ee7b32 S	173
	174	for format_id, resources in talk_info['resources'].items():
	175	if format_id == 'h264':
	176	for resource in resources:
	177	bitrate = int_or_none(resource.get('bitrate'))
	178	formats.append({
	179	'url': resource['file'],
	180	'format_id': '%s-%sk' % (format_id, bitrate),
	181	'tbr': bitrate,
	182	})
	183	elif format_id == 'rtmp':
	184	streamer = talk_info.get('streamer')
	185	if not streamer:
	186	continue
	187	for resource in resources:
	188	formats.append({
	189	'format_id': '%s-%s' % (format_id, resource.get('name')),
	190	'url': streamer,
	191	'play_path': resource['file'],
	192	'ext': 'flv',
	193	'width': int_or_none(resource.get('width')),
	194	'height': int_or_none(resource.get('height')),
	195	'tbr': int_or_none(resource.get('bitrate')),
	196	})
	197	elif format_id == 'hls':
736785ab S	198	hls_formats = self._extract_m3u8_formats(
	199	resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
	200	for f in hls_formats:
6621ca39 S	201	if f.get('format_id') == 'hls-meta':
6621ca39 S	202	continue
0f0b5736 S	203	if not f.get('height'):
	204	f['vcodec'] = 'none'
	205	else:
	206	f['acodec'] = 'none'
736785ab	207	formats.extend(hls_formats)
66ee7b32 S	208
	209	audio_download = talk_info.get('audioDownload')
	210	if audio_download:
	211	formats.append({
	212	'url': audio_download,
	213	'format_id': 'audio',
736785ab	214	'vcodec': 'none',
14f7abfa	215	'preference': -0.5,
66ee7b32 S	216	})
66ee7b32 S	217
652bee05 JMF	218	self._sort_formats(formats)
652bee05 JMF	219
7b9965ea	220	video_id = compat_str(talk_info['id'])
a9a3876d	221
b6c1cecc JMF	222	thumbnail = talk_info['thumb']
	223	if not thumbnail.startswith('http'):
	224	thumbnail = 'http://' + thumbnail
463a9087	225	return {
a9a3876d	226	'id': video_id,
a8eb5a8e	227	'title': talk_info['title'].strip(),
652bee05	228	'uploader': talk_info['speaker'],
b6c1cecc	229	'thumbnail': thumbnail,
652bee05	230	'description': self._og_search_description(webpage),
03091e37	231	'subtitles': self._get_subtitles(video_id, talk_info),
0d8cb1cc	232	'formats': formats,
eb4cb42a	233	'duration': talk_info.get('duration'),
0d8cb1cc PH	234	}
0d8cb1cc PH	235
a504ced0	236	def _get_subtitles(self, video_id, talk_info):
652bee05 JMF	237	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	238	if languages:
	239	sub_lang_list = {}
	240	for l in languages:
a504ced0 JMF	241	sub_lang_list[l] = [
	242	{
	243	'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
	244	'ext': ext,
	245	}
	246	for ext in ['ted', 'srt']
	247	]
652bee05 JMF	248	return sub_lang_list
652bee05 JMF	249	else:
652bee05	250	return {}
ac6c1048 PH	251
	252	def _watch_info(self, url, name):
	253	webpage = self._download_webpage(url, name)
	254
	255	config_json = self._html_search_regex(
de9bd74b S	256	r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
	257	webpage, 'config')
	258	config = json.loads(config_json)['config']
ac6c1048 PH	259	video_url = config['video']['url']
	260	thumbnail = config.get('image', {}).get('url')
	261
	262	title = self._html_search_regex(
	263	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	264	description = self._html_search_regex(
621f33c9 PH	265	[
	266	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
	267	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	268	],
ac6c1048 PH	269	webpage, 'description', fatal=False)
	270
	271	return {
	272	'id': name,
	273	'url': video_url,
	274	'title': title,
	275	'thumbnail': thumbnail,
	276	'description': description,
	277	}