[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import compat_str
from ..utils import int_or_none


class TEDIE(InfoExtractor):
    # Verbose-mode pattern for every URL shape handled by this extractor:
    # /playlists/..., /talks/... and /watch/<a>/<b>/..., on the www host or
    # on the embed / embed-ssl hosts.  The named groups are consumed by
    # _real_extract(): 'type' is the host prefix, 'proto' and 'urlmain' are
    # glued back together around 'www' to turn an embed URL into a regular
    # one, 'type_talk' / 'type_watch' select the extraction routine
    # (anything else is treated as a playlist) and 'name' is the path
    # component handed to that routine.
    # NOTE: the "#" comments inside the literal are part of the regex source
    # and are ignored by the engine only because of the (?x) flag.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    # Test cases for this extractor.  Each entry names a URL and the fields
    # the extraction result is expected to contain; entries cover a plain
    # talk, a /watch/ page, a playlist and talks hosted on YouTube.
    # Values starting with 're:' are regular expressions, so they are written
    # as raw strings to keep backslashes from being read as (invalid) string
    # escape sequences.
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged by _talk_info() into the formats it builds from the
    # page's 'nativeDownloads' mapping, looked up by format id.  The
    # 'preference' value grows together with the listed resolution.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the data object embedded in a TED page, decoded from JSON.

        The page source is searched for an inline script call of the form
        q("<name>.init",{...})</script> and the object literal passed as the
        second argument is parsed.

        webpage -- HTML source of the page.

        Raises ValueError if the captured text is not valid JSON.  A page
        without the init call is handled by _search_regex() (presumably by
        raising an extraction error, since no fatal=False is passed).
        '''
        info_json = self._search_regex(
            # The dot is escaped so that only a literal "<name>.init" is
            # accepted; unescaped it would match any character.
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Route *url* to the extraction routine matching its shape.

        Embed URLs are rewritten to their www counterpart and returned as a
        URL result for the 'TED' extractor; talk and watch URLs go to
        _talk_info() and _watch_info() respectively, and everything else is
        treated as a playlist.
        '''
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        host_kind = mobj.group('type')
        if host_kind.startswith('embed'):
            # Swap the embed host prefix for 'www', keeping scheme and path.
            return self.url_result(
                mobj.group('proto') + 'www' + mobj.group('urlmain'), 'TED')

        name = mobj.group('name')
        if mobj.group('type_talk'):
            handler = self._talk_info
        elif mobj.group('type_watch'):
            handler = self._watch_info
        else:
            handler = self._playlist_videos_info
        return handler(url, name)

    def _playlist_videos_info(self, url, name):
        '''Return a playlist result listing the talks of a TED playlist.

        The playlist page is downloaded, its embedded data is decoded with
        _extract_info() and every talk found there becomes a URL entry
        pointing at the talk page, to be handled by this extractor again.
        '''
        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        data = self._extract_info(page)
        meta = data['playlist']

        entries = []
        for talk in data['talks']:
            talk_url = 'http://www.ted.com/talks/' + talk['slug']
            entries.append(self.url_result(talk_url, self.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=compat_str(meta['id']),
            playlist_title=meta['title'])

    def _talk_info(self, url, video_name):
        '''Extract a single talk from its /talks/ page.

        url -- address of the talk page.
        video_name -- name taken from the URL; used as the id while
            downloading and in progress messages.

        Returns a 'url' result when the page declares an external host for
        the talk (the 'code' field is used for YouTube, the 'uri' field
        otherwise).  For natively hosted talks an info dict is returned with
        the formats collected from the direct downloads, the h264 / rtmp /
        hls resources and the audio download, plus subtitles and metadata.

        Raises KeyError if the page data lacks a mandatory field ('talks',
        'id', 'title', or 'service' on an external talk).  Optional data
        (download mappings, single resources, thumbnail, speaker) is skipped
        when absent instead of aborting the extraction.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = []

        # Direct downloads: a format id -> URL mapping.  Single URLs can be
        # None; the "or {}" additionally guards against the whole mapping
        # being null or missing.
        native_downloads = talk_info.get('nativeDownloads') or {}
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            native_format = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add the known resolution / preference for this format id
            native_format.update(self._NATIVE_FORMATS.get(format_id, {}))
            formats.append(native_format)

        for format_id, resources in (talk_info.get('resources') or {}).items():
            if not resources:
                continue
            if format_id == 'h264':
                for resource in resources:
                    h264_url = resource.get('file')
                    if not h264_url:
                        continue
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                # rtmp entries only carry a play path; without the streamer
                # URL they cannot be played at all.
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    play_path = resource.get('file')
                    if not play_path:
                        continue
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': play_path,
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                # Never hand a missing manifest URL to the m3u8 downloader
                stream_url = resources.get('stream')
                if not stream_url:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_name, 'mp4', m3u8_id=format_id))

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                # Flag the entry as audio-only so that it is not ranked
                # like a video format.
                'vcodec': 'none',
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        # The thumbnail may be given without a scheme
        thumbnail = talk_info.get('thumb')
        if thumbnail and not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                sub_lang_list[l] = [
                    {
                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
                        'ext': ext,
                    }
                    for ext in ['ted', 'srt']
                ]
            return sub_lang_list
        else:
            return {}

    def _watch_info(self, url, name):
        '''Extract the video of a /watch/ page.

        The media URL and the optional thumbnail come from the 'config'
        object passed to the "pages.jwplayer" script call; title and
        description are scraped from the markup.  *name* is returned as the
        video id.  A missing description is tolerated (fatal=False), the
        other lookups are not.
        '''
        page = self._download_webpage(url, name)

        player_config = json.loads(self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            page, 'config'))['config']
        media_url = player_config['video']['url']
        poster = player_config.get('image', {}).get('url')

        heading = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", page, 'title')

        # Two markup variants are tried for the "about this talk" text
        about_patterns = [
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
        ]
        about = self._html_search_regex(
            about_patterns, page, 'description', fatal=False)

        return {
            'id': name,
            'url': media_url,
            'title': heading,
            'thumbnail': poster,
            'description': about,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a504ced0	6	from .common import InfoExtractor
9fd5ce0c	7
66ee7b32 S	8	from ..compat import compat_str
66ee7b32 S	9	from ..utils import int_or_none
4ed3e510	10
f853f859	11
a504ced0	12	class TEDIE(InfoExtractor):
aab74fa1 PH	13	_VALID_URL = r'''(?x)
aab74fa1 PH	14	(?P<proto>https?://)
cd791a5e	15	(?P<type>www\|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
bacac173 JMF	16	(
	17	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	18	\|
	19	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	20	\|
ac6c1048 PH	21	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	22	)
bacac173 JMF	23	(/lang/(.*?))? # The url may contain the language
ac6c1048	24	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	25	.*)$
bacac173	26	'''
ac6c1048	27	_TESTS = [{
f853f859	28	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
4d2f143c	29	'md5': 'fc94ac279feebbce69f21c0c6ee82810',
f853f859	30	'info_dict': {
7b9965ea JMF	31	'id': '102',
7b9965ea JMF	32	'ext': 'mp4',
652bee05	33	'title': 'The illusion of consciousness',
bacac173	34	'description': ('Philosopher Dan Dennett makes a compelling '
9e1a5b84 JW	35	'argument that not only don\'t we understand our own '
	36	'consciousness, but that half the time our brains are '
	37	'actively fooling us.'),
652bee05	38	'uploader': 'Dan Dennett',
0ba77818	39	'width': 854,
eb4cb42a	40	'duration': 1308,
6f5ac90c	41	}
ac6c1048 PH	42	}, {
	43	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	44	'md5': '226f4fb9c62380d11b7995efa4c87994',
	45	'info_dict': {
	46	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	47	'ext': 'mp4',
	48	'title': 'Vishal Sikka: The beauty and power of algorithms',
	49	'thumbnail': 're:^https?://.+\.jpg',
	50	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	51	}
2d4c98db JMF	52	}, {
	53	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
	54	'info_dict': {
	55	'id': '1972',
5bec5748	56	'ext': 'mp4',
2d4c98db JMF	57	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	58	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	59	'description': 'md5:5174aed4d0f16021b704120360f72b92',
eb4cb42a	60	'duration': 1128,
2d4c98db	61	},
22a6f150 PH	62	}, {
	63	'url': 'http://www.ted.com/playlists/who_are_the_hackers',
	64	'info_dict': {
	65	'id': '10',
	66	'title': 'Who are the hackers?',
	67	},
	68	'playlist_mincount': 6,
a72cbfac JMF	69	}, {
	70	# contains a youtube video
	71	'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
	72	'add_ie': ['Youtube'],
	73	'info_dict': {
	74	'id': '_ZG8HBuDjgc',
	75	'ext': 'mp4',
	76	'title': 'Douglas Adams: Parrots the Universe and Everything',
	77	'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
	78	'uploader': 'University of California Television (UCTV)',
	79	'uploader_id': 'UCtelevision',
	80	'upload_date': '20080522',
	81	},
	82	'params': {
	83	'skip_download': True,
	84	},
a461a119 S	85	}, {
	86	# YouTube video
	87	'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
	88	'add_ie': ['Youtube'],
	89	'info_dict': {
	90	'id': 'aFBIPO-P7LM',
	91	'ext': 'mp4',
	92	'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
	93	'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
	94	'uploader': 'TEDx Talks',
	95	'uploader_id': 'TEDxTalks',
	96	'upload_date': '20111216',
	97	},
	98	'params': {
	99	'skip_download': True,
	100	},
ac6c1048	101	}]
9fd5ce0c	102
0ba77818 PH	103	_NATIVE_FORMATS = {
	104	'low': {'preference': 1, 'width': 320, 'height': 180},
	105	'medium': {'preference': 2, 'width': 512, 'height': 288},
	106	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	107	}
9fd5ce0c	108
ca1fee34	109	def _extract_info(self, webpage):
bacac173	110	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
9e1a5b84	111	webpage, 'info json')
ca1fee34 JMF	112	return json.loads(info_json)
ca1fee34 JMF	113
9fd5ce0c	114	def _real_extract(self, url):
bacac173	115	m = re.match(self._VALID_URL, url, re.VERBOSE)
cd791a5e	116	if m.group('type').startswith('embed'):
aab74fa1 PH	117	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
aab74fa1 PH	118	return self.url_result(desktop_url, 'TED')
bacac173	119	name = m.group('name')
9fd5ce0c	120	if m.group('type_talk'):
bacac173	121	return self._talk_info(url, name)
ac6c1048 PH	122	elif m.group('type_watch'):
ac6c1048 PH	123	return self._watch_info(url, name)
bacac173	124	else:
ca1fee34	125	return self._playlist_videos_info(url, name)
9fd5ce0c	126
ca1fee34	127	def _playlist_videos_info(self, url, name):
9fd5ce0c	128	'''Returns the videos of the playlist'''
fc2ef392	129
ca1fee34	130	webpage = self._download_webpage(url, name,
9e1a5b84	131	'Downloading playlist webpage')
ca1fee34 JMF	132	info = self._extract_info(webpage)
ca1fee34 JMF	133	playlist_info = info['playlist']
9fd5ce0c	134
fc2ef392	135	playlist_entries = [
f07a9f6f	136	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	137	for talk in info['talks']
fc2ef392 PH	138	]
fc2ef392 PH	139	return self.playlist_result(
ca1fee34 JMF	140	playlist_entries,
	141	playlist_id=compat_str(playlist_info['id']),
	142	playlist_title=playlist_info['title'])
9fd5ce0c	143
bacac173 JMF	144	def _talk_info(self, url, video_name):
bacac173 JMF	145	webpage = self._download_webpage(url, video_name)
9fd5ce0c	146	self.report_extraction(video_name)
a9a3876d	147
ca1fee34	148	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	149
a461a119 S	150	external = talk_info.get('external')
	151	if external:
	152	service = external['service']
	153	self.to_screen('Found video from %s' % service)
	154	ext_url = None
	155	if service.lower() == 'youtube':
	156	ext_url = external.get('code')
a72cbfac JMF	157	return {
a72cbfac JMF	158	'_type': 'url',
a461a119	159	'url': ext_url or external['uri'],
a72cbfac JMF	160	}
a72cbfac JMF	161
652bee05	162	formats = [{
652bee05 JMF	163	'url': format_url,
	164	'format_id': format_id,
	165	'format': format_id,
2d4c98db JMF	166	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	167	if formats:
	168	for f in formats:
	169	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	170	if finfo:
	171	f.update(finfo)
66ee7b32 S	172
	173	for format_id, resources in talk_info['resources'].items():
	174	if format_id == 'h264':
	175	for resource in resources:
	176	bitrate = int_or_none(resource.get('bitrate'))
	177	formats.append({
	178	'url': resource['file'],
	179	'format_id': '%s-%sk' % (format_id, bitrate),
	180	'tbr': bitrate,
	181	})
	182	elif format_id == 'rtmp':
	183	streamer = talk_info.get('streamer')
	184	if not streamer:
	185	continue
	186	for resource in resources:
	187	formats.append({
	188	'format_id': '%s-%s' % (format_id, resource.get('name')),
	189	'url': streamer,
	190	'play_path': resource['file'],
	191	'ext': 'flv',
	192	'width': int_or_none(resource.get('width')),
	193	'height': int_or_none(resource.get('height')),
	194	'tbr': int_or_none(resource.get('bitrate')),
	195	})
	196	elif format_id == 'hls':
	197	formats.extend(self._extract_m3u8_formats(
	198	resources.get('stream'), video_name, 'mp4', m3u8_id=format_id))
	199
	200	audio_download = talk_info.get('audioDownload')
	201	if audio_download:
	202	formats.append({
	203	'url': audio_download,
	204	'format_id': 'audio',
	205	})
	206
652bee05 JMF	207	self._sort_formats(formats)
652bee05 JMF	208
7b9965ea	209	video_id = compat_str(talk_info['id'])
a9a3876d	210
b6c1cecc JMF	211	thumbnail = talk_info['thumb']
	212	if not thumbnail.startswith('http'):
	213	thumbnail = 'http://' + thumbnail
463a9087	214	return {
a9a3876d	215	'id': video_id,
a8eb5a8e	216	'title': talk_info['title'].strip(),
652bee05	217	'uploader': talk_info['speaker'],
b6c1cecc	218	'thumbnail': thumbnail,
652bee05	219	'description': self._og_search_description(webpage),
03091e37	220	'subtitles': self._get_subtitles(video_id, talk_info),
0d8cb1cc	221	'formats': formats,
eb4cb42a	222	'duration': talk_info.get('duration'),
0d8cb1cc PH	223	}
0d8cb1cc PH	224
a504ced0	225	def _get_subtitles(self, video_id, talk_info):
652bee05 JMF	226	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	227	if languages:
	228	sub_lang_list = {}
	229	for l in languages:
a504ced0 JMF	230	sub_lang_list[l] = [
	231	{
	232	'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
	233	'ext': ext,
	234	}
	235	for ext in ['ted', 'srt']
	236	]
652bee05 JMF	237	return sub_lang_list
652bee05 JMF	238	else:
652bee05	239	return {}
ac6c1048 PH	240
	241	def _watch_info(self, url, name):
	242	webpage = self._download_webpage(url, name)
	243
	244	config_json = self._html_search_regex(
de9bd74b S	245	r'"pages\.jwplayer"\s,\s({.+?})\s\)\s</script>',
	246	webpage, 'config')
	247	config = json.loads(config_json)['config']
ac6c1048 PH	248	video_url = config['video']['url']
	249	thumbnail = config.get('image', {}).get('url')
	250
	251	title = self._html_search_regex(
	252	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	253	description = self._html_search_regex(
621f33c9 PH	254	[
	255	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.?</h4>(.?)</div>',
	256	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	257	],
ac6c1048 PH	258	webpage, 'description', fatal=False)
	259
	260	return {
	261	'id': name,
	262	'url': video_url,
	263	'title': title,
	264	'thumbnail': thumbnail,
	265	'description': description,
	266	}