[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    '''Extractor for ted.com talk, playlist and "watch" pages.

    ``_real_extract`` dispatches on the named groups of ``_VALID_URL``:

    * ``embed.ted.com`` URLs are handed back as the equivalent
      ``www.ted.com`` URL,
    * ``/talks/...`` URLs go to ``_talk_info``,
    * ``/watch/...`` URLs go to ``_watch_info``,
    * everything else that matches (``/playlists/...``) goes to
      ``_playlist_videos_info``.
    '''

    # Verbose-mode pattern; the named groups are read in _real_extract.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid escape in a normal literal.
            # The resulting value is unchanged.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }]

    # Extra fields merged into a native download format, keyed by the
    # format id used in the page's "nativeDownloads" mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON object given to the page's init call.

        webpage -- HTML source of a talk or playlist page.

        The object literal is the second argument of a
        ``q("<word>.init", {...})`` call that directly precedes
        ``</script>``; it is captured with a regex and parsed with
        ``json.loads``.
        '''
        info_json = self._search_regex(
            # The dot before "init" is escaped so it only matches a literal
            # "." rather than any character.
            r'q\("\w+\.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch *url* to the handler matching its page type.'''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            # Rebuild the URL with the "www" host and let it be processed
            # as a regular TED URL.
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist'''

        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        # Each talk is referenced by its slug and resolved later through
        # this same extractor.
        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Build the info dict for a single talk page.

        url -- address of the talk page.
        video_name -- name taken from the URL; used as the id when
                      downloading the page and reporting extraction.

        Returns the info dict, or None when the ``listsubtitles`` downloader
        option is set (the available subtitles are listed instead).
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # A missing or null "nativeDownloads" entry is treated like an empty
        # mapping so that the rtmp fallback below is still reached.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            finfo = self._NATIVE_FORMATS.get(format_id)
            if finfo:
                fmt.update(finfo)
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            # The page may give the thumbnail without a scheme.
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Map each language code of the talk to its SRT subtitles URL.

        video_id -- id of the talk, inserted into the subtitles URL.
        talk_info -- talk dictionary; its optional "languages" list holds
                     dictionaries with a "languageCode" key.

        Returns a dict (empty, after a warning, if no language is listed).
        '''
        sub_lang_list = {}
        # "or []" also covers an explicit null value for "languages".
        for lang in talk_info.get('languages') or []:
            lang_code = lang['languageCode']
            sub_lang_list[lang_code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, lang_code))
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
        return sub_lang_list

    def _watch_info(self, url, name):
        '''Build the info dict for a ``/watch/...`` page.

        url -- address of the watch page.
        name -- last path component of the URL; used as the video id.

        The video URL and the optional thumbnail come from the JSON stored
        in the page's ``data-config`` attribute; title and description are
        scraped from the HTML (a missing description is not fatal).
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a9a3876d	6	from .subtitles import SubtitlesInfoExtractor
9fd5ce0c	7
4ed3e510	8	from ..utils import (
ca1fee34	9	compat_str,
4ed3e510 IM	10	)
4ed3e510 IM	11
f853f859	12
a9a3876d	13	class TEDIE(SubtitlesInfoExtractor):
aab74fa1 PH	14	_VALID_URL = r'''(?x)
	15	(?P<proto>https?://)
	16	(?P<type>www|embed)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
4d2f143c	30	'md5': 'fc94ac279feebbce69f21c0c6ee82810',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173 JMF	35	'description': ('Philosopher Dan Dennett makes a compelling '
	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
6f5ac90c	41	}
ac6c1048 PH	42	}, {
	43	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	44	'md5': '226f4fb9c62380d11b7995efa4c87994',
	45	'info_dict': {
	46	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	47	'ext': 'mp4',
	48	'title': 'Vishal Sikka: The beauty and power of algorithms',
	49	'thumbnail': 're:^https?://.+\.jpg',
	50	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	51	}
2d4c98db JMF	52	}, {
	53	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
	54	'info_dict': {
	55	'id': '1972',
5bec5748	56	'ext': 'mp4',
2d4c98db JMF	57	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	58	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	59	'description': 'md5:5174aed4d0f16021b704120360f72b92',
2d4c98db	60	},
22a6f150 PH	61	}, {
	62	'url': 'http://www.ted.com/playlists/who_are_the_hackers',
	63	'info_dict': {
	64	'id': '10',
	65	'title': 'Who are the hackers?',
	66	},
	67	'playlist_mincount': 6,
ac6c1048	68	}]
9fd5ce0c	69
0ba77818 PH	70	_NATIVE_FORMATS = {
	71	'low': {'preference': 1, 'width': 320, 'height': 180},
	72	'medium': {'preference': 2, 'width': 512, 'height': 288},
	73	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	74	}
9fd5ce0c	75
ca1fee34	76	def _extract_info(self, webpage):
bacac173 JMF	77	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
bacac173 JMF	78	webpage, 'info json')
ca1fee34 JMF	79	return json.loads(info_json)
ca1fee34 JMF	80
9fd5ce0c	81	def _real_extract(self, url):
bacac173	82	m = re.match(self._VALID_URL, url, re.VERBOSE)
aab74fa1 PH	83	if m.group('type') == 'embed':
	84	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
	85	return self.url_result(desktop_url, 'TED')
bacac173	86	name = m.group('name')
9fd5ce0c	87	if m.group('type_talk'):
bacac173	88	return self._talk_info(url, name)
ac6c1048 PH	89	elif m.group('type_watch'):
ac6c1048 PH	90	return self._watch_info(url, name)
bacac173	91	else:
ca1fee34	92	return self._playlist_videos_info(url, name)
9fd5ce0c	93
ca1fee34	94	def _playlist_videos_info(self, url, name):
9fd5ce0c	95	'''Returns the videos of the playlist'''
fc2ef392	96
ca1fee34 JMF	97	webpage = self._download_webpage(url, name,
	98	'Downloading playlist webpage')
	99	info = self._extract_info(webpage)
	100	playlist_info = info['playlist']
9fd5ce0c	101
fc2ef392	102	playlist_entries = [
f07a9f6f	103	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	104	for talk in info['talks']
fc2ef392 PH	105	]
fc2ef392 PH	106	return self.playlist_result(
ca1fee34 JMF	107	playlist_entries,
	108	playlist_id=compat_str(playlist_info['id']),
	109	playlist_title=playlist_info['title'])
9fd5ce0c	110
bacac173 JMF	111	def _talk_info(self, url, video_name):
bacac173 JMF	112	webpage = self._download_webpage(url, video_name)
9fd5ce0c	113	self.report_extraction(video_name)
a9a3876d	114
ca1fee34	115	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	116
652bee05	117	formats = [{
652bee05 JMF	118	'url': format_url,
	119	'format_id': format_id,
	120	'format': format_id,
2d4c98db JMF	121	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	122	if formats:
	123	for f in formats:
	124	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	125	if finfo:
	126	f.update(finfo)
	127	else:
	128	# Use rtmp downloads
	129	formats = [{
	130	'format_id': f['name'],
	131	'url': talk_info['streamer'],
	132	'play_path': f['file'],
	133	'ext': 'flv',
	134	'width': f['width'],
	135	'height': f['height'],
	136	'tbr': f['bitrate'],
	137	} for f in talk_info['resources']['rtmp']]
652bee05 JMF	138	self._sort_formats(formats)
652bee05 JMF	139
7b9965ea	140	video_id = compat_str(talk_info['id'])
a9a3876d	141	# subtitles
652bee05	142	video_subtitles = self.extract_subtitles(video_id, talk_info)
a9a3876d	143	if self._downloader.params.get('listsubtitles', False):
652bee05	144	self._list_available_subtitles(video_id, talk_info)
a9a3876d IM	145	return
a9a3876d IM	146
b6c1cecc JMF	147	thumbnail = talk_info['thumb']
	148	if not thumbnail.startswith('http'):
	149	thumbnail = 'http://' + thumbnail
463a9087	150	return {
a9a3876d	151	'id': video_id,
652bee05 JMF	152	'title': talk_info['title'],
652bee05 JMF	153	'uploader': talk_info['speaker'],
b6c1cecc	154	'thumbnail': thumbnail,
652bee05	155	'description': self._og_search_description(webpage),
a9a3876d	156	'subtitles': video_subtitles,
0d8cb1cc PH	157	'formats': formats,
	158	}
	159
652bee05 JMF	160	def _get_available_subtitles(self, video_id, talk_info):
	161	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	162	if languages:
	163	sub_lang_list = {}
	164	for l in languages:
	165	url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
	166	sub_lang_list[l] = url
	167	return sub_lang_list
	168	else:
f07a9f6f	169	self._downloader.report_warning('video doesn\'t have subtitles')
652bee05	170	return {}
ac6c1048 PH	171
	172	def _watch_info(self, url, name):
	173	webpage = self._download_webpage(url, name)
	174
	175	config_json = self._html_search_regex(
	176	r"data-config='([^']+)", webpage, 'config')
	177	config = json.loads(config_json)
	178	video_url = config['video']['url']
	179	thumbnail = config.get('image', {}).get('url')
	180
	181	title = self._html_search_regex(
	182	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	183	description = self._html_search_regex(
621f33c9 PH	184	[
	185	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
	186	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	187	],
ac6c1048 PH	188	webpage, 'description', fatal=False)
	189
	190	return {
	191	'id': name,
	192	'url': video_url,
	193	'title': title,
	194	'thumbnail': thumbnail,
	195	'description': description,
	196	}