[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..compat import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com talk, playlist, watch and embed URLs.

    The kind of page is decided by the named groups of ``_VALID_URL``;
    ``_real_extract`` dispatches to one helper per kind.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: '\.' is not a valid escape in a normal string literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged into a native download format, keyed by its
    # format id (see _talk_info).
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object passed to q("<name>.init", ...) in webpage.'''
        info_json = self._search_regex(
            # The dot before "init" is escaped so it only matches a literal '.'.
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch url to the talk, watch or playlist helper.

        Embed URLs are rewritten to the equivalent www URL and handed back
        as a url result for the 'TED' extractor.
        '''
        # _VALID_URL already starts with (?x), so no extra flag is needed.
        m = re.match(self._VALID_URL, url)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(url, name,
                                         'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Every talk is delegated back to this extractor through its talk URL.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Return the info dict for a single talk page.

        Returns a plain url result when the talk is hosted by an external
        service, and None (after listing the subtitles) when the
        'listsubtitles' option is set.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            self.to_screen('Found video from %s' % external['service'])
            # Prefer the 'code' field and fall back to 'uri'.
            if 'code' in external:
                ext_url = external['code']
            else:
                ext_url = external['uri']
            return {
                '_type': 'url',
                'url': ext_url,
            }

        # A missing or null 'nativeDownloads' must not raise here, otherwise
        # the rtmp fallback below could never be reached.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is treated as optional, as in _watch_info.
        thumbnail = talk_info.get('thumb')
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Return a dict mapping each language code of the talk to its srt URL.

        Reports a warning and returns an empty dict when talk_info lists no
        languages.
        '''
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        sub_lang_list = {}
        for lang_code in languages:
            sub_lang_list[lang_code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, lang_code))
        return sub_lang_list

    def _watch_info(self, url, name):
        '''Return the info dict for a ted.com/watch/... page.

        The video URL and thumbnail come from the "pages.jwplayer" config
        embedded in the page; title and description are scraped from the
        HTML. The description is optional (fatal=False).
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        # 'image' may be absent or null; either way the thumbnail is None.
        thumbnail = (config.get('image') or {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a9a3876d	6	from .subtitles import SubtitlesInfoExtractor
9fd5ce0c	7
1cc79574	8	from ..compat import (
ca1fee34	9	compat_str,
4ed3e510 IM	10	)
4ed3e510 IM	11
f853f859	12
a9a3876d	13	class TEDIE(SubtitlesInfoExtractor):
aab74fa1 PH	14	_VALID_URL = r'''(?x)
aab74fa1 PH	15	(?P<proto>https?://)
cd791a5e	16	(?P<type>www\|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	\|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	\|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
4d2f143c	30	'md5': 'fc94ac279feebbce69f21c0c6ee82810',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173	35	'description': ('Philosopher Dan Dennett makes a compelling '
9e1a5b84 JW	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
eb4cb42a	41	'duration': 1308,
6f5ac90c	42	}
ac6c1048 PH	43	}, {
	44	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	45	'md5': '226f4fb9c62380d11b7995efa4c87994',
	46	'info_dict': {
	47	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	48	'ext': 'mp4',
	49	'title': 'Vishal Sikka: The beauty and power of algorithms',
	50	'thumbnail': 're:^https?://.+\.jpg',
	51	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	52	}
2d4c98db JMF	53	}, {
	54	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
	55	'info_dict': {
	56	'id': '1972',
5bec5748	57	'ext': 'mp4',
2d4c98db JMF	58	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	59	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	60	'description': 'md5:5174aed4d0f16021b704120360f72b92',
eb4cb42a	61	'duration': 1128,
2d4c98db	62	},
22a6f150 PH	63	}, {
	64	'url': 'http://www.ted.com/playlists/who_are_the_hackers',
	65	'info_dict': {
	66	'id': '10',
	67	'title': 'Who are the hackers?',
	68	},
	69	'playlist_mincount': 6,
a72cbfac JMF	70	}, {
	71	# contains a youtube video
	72	'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
	73	'add_ie': ['Youtube'],
	74	'info_dict': {
	75	'id': '_ZG8HBuDjgc',
	76	'ext': 'mp4',
	77	'title': 'Douglas Adams: Parrots the Universe and Everything',
	78	'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
	79	'uploader': 'University of California Television (UCTV)',
	80	'uploader_id': 'UCtelevision',
	81	'upload_date': '20080522',
	82	},
	83	'params': {
	84	'skip_download': True,
	85	},
ac6c1048	86	}]
9fd5ce0c	87
0ba77818 PH	88	_NATIVE_FORMATS = {
	89	'low': {'preference': 1, 'width': 320, 'height': 180},
	90	'medium': {'preference': 2, 'width': 512, 'height': 288},
	91	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	92	}
9fd5ce0c	93
ca1fee34	94	def _extract_info(self, webpage):
bacac173	95	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
9e1a5b84	96	webpage, 'info json')
ca1fee34 JMF	97	return json.loads(info_json)
ca1fee34 JMF	98
9fd5ce0c	99	def _real_extract(self, url):
bacac173	100	m = re.match(self._VALID_URL, url, re.VERBOSE)
cd791a5e	101	if m.group('type').startswith('embed'):
aab74fa1 PH	102	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
aab74fa1 PH	103	return self.url_result(desktop_url, 'TED')
bacac173	104	name = m.group('name')
9fd5ce0c	105	if m.group('type_talk'):
bacac173	106	return self._talk_info(url, name)
ac6c1048 PH	107	elif m.group('type_watch'):
ac6c1048 PH	108	return self._watch_info(url, name)
bacac173	109	else:
ca1fee34	110	return self._playlist_videos_info(url, name)
9fd5ce0c	111
ca1fee34	112	def _playlist_videos_info(self, url, name):
9fd5ce0c	113	'''Returns the videos of the playlist'''
fc2ef392	114
ca1fee34	115	webpage = self._download_webpage(url, name,
9e1a5b84	116	'Downloading playlist webpage')
ca1fee34 JMF	117	info = self._extract_info(webpage)
ca1fee34 JMF	118	playlist_info = info['playlist']
9fd5ce0c	119
fc2ef392	120	playlist_entries = [
f07a9f6f	121	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	122	for talk in info['talks']
fc2ef392 PH	123	]
fc2ef392 PH	124	return self.playlist_result(
ca1fee34 JMF	125	playlist_entries,
	126	playlist_id=compat_str(playlist_info['id']),
	127	playlist_title=playlist_info['title'])
9fd5ce0c	128
bacac173 JMF	129	def _talk_info(self, url, video_name):
bacac173 JMF	130	webpage = self._download_webpage(url, video_name)
9fd5ce0c	131	self.report_extraction(video_name)
a9a3876d	132
ca1fee34	133	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	134
a72cbfac JMF	135	if talk_info.get('external') is not None:
a72cbfac JMF	136	self.to_screen('Found video from %s' % talk_info['external']['service'])
1bd83860	137	if 'code' in talk_info['external']:
	138	ext_url = talk_info['external']['code']
	139	else:
	140	ext_url = talk_info['external']['uri']
a72cbfac JMF	141	return {
a72cbfac JMF	142	'_type': 'url',
1bd83860	143	'url': ext_url,
a72cbfac JMF	144	}
a72cbfac JMF	145
652bee05	146	formats = [{
652bee05 JMF	147	'url': format_url,
	148	'format_id': format_id,
	149	'format': format_id,
2d4c98db JMF	150	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	151	if formats:
	152	for f in formats:
	153	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	154	if finfo:
	155	f.update(finfo)
	156	else:
	157	# Use rtmp downloads
	158	formats = [{
	159	'format_id': f['name'],
	160	'url': talk_info['streamer'],
	161	'play_path': f['file'],
	162	'ext': 'flv',
	163	'width': f['width'],
	164	'height': f['height'],
	165	'tbr': f['bitrate'],
	166	} for f in talk_info['resources']['rtmp']]
652bee05 JMF	167	self._sort_formats(formats)
652bee05 JMF	168
7b9965ea	169	video_id = compat_str(talk_info['id'])
a9a3876d	170	# subtitles
652bee05	171	video_subtitles = self.extract_subtitles(video_id, talk_info)
a9a3876d	172	if self._downloader.params.get('listsubtitles', False):
652bee05	173	self._list_available_subtitles(video_id, talk_info)
a9a3876d IM	174	return
a9a3876d IM	175
b6c1cecc JMF	176	thumbnail = talk_info['thumb']
	177	if not thumbnail.startswith('http'):
	178	thumbnail = 'http://' + thumbnail
463a9087	179	return {
a9a3876d	180	'id': video_id,
a8eb5a8e	181	'title': talk_info['title'].strip(),
652bee05	182	'uploader': talk_info['speaker'],
b6c1cecc	183	'thumbnail': thumbnail,
652bee05	184	'description': self._og_search_description(webpage),
a9a3876d	185	'subtitles': video_subtitles,
0d8cb1cc	186	'formats': formats,
eb4cb42a	187	'duration': talk_info.get('duration'),
0d8cb1cc PH	188	}
0d8cb1cc PH	189
652bee05 JMF	190	def _get_available_subtitles(self, video_id, talk_info):
	191	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	192	if languages:
	193	sub_lang_list = {}
	194	for l in languages:
	195	url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
	196	sub_lang_list[l] = url
	197	return sub_lang_list
	198	else:
f07a9f6f	199	self._downloader.report_warning('video doesn\'t have subtitles')
652bee05	200	return {}
ac6c1048 PH	201
	202	def _watch_info(self, url, name):
	203	webpage = self._download_webpage(url, name)
	204
	205	config_json = self._html_search_regex(
de9bd74b S	206	r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
	207	webpage, 'config')
	208	config = json.loads(config_json)['config']
ac6c1048 PH	209	video_url = config['video']['url']
	210	thumbnail = config.get('image', {}).get('url')
	211
	212	title = self._html_search_regex(
	213	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	214	description = self._html_search_regex(
621f33c9 PH	215	[
	216	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
	217	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	218	],
ac6c1048 PH	219	webpage, 'description', fatal=False)
	220
	221	return {
	222	'id': name,
	223	'url': video_url,
	224	'title': title,
	225	'thumbnail': thumbnail,
	226	'description': description,
	227	}