[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    '''Information extractor for ted.com talk, playlist and "watch" pages.

    ``_real_extract`` matches the URL against ``_VALID_URL`` and dispatches
    to ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``.  URLs on
    the ``embed`` host are rewritten to the ``www`` host and handed back as a
    URL result.
    '''

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': (
                'Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is not a valid string escape, so without the
            # r-prefix newer Python versions emit a warning for this literal.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'md5': '49144e345a899b8cb34d315f3b9cfeeb',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
        },
    }]

    # Extra fields merged into a native-download format dict, keyed by the
    # format id found in the talk's 'nativeDownloads' mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the decoded JSON argument of the page's q("<name>.init", ...) call.

        webpage -- HTML source of a talk or playlist page.

        The JSON text is located with ``self._search_regex`` and decoded with
        ``json.loads``.
        '''
        # NOTE(review): the '.' before 'init' is unescaped, so it matches any
        # single character rather than only a literal dot.
        info_json = self._search_regex(
            r'q\("\w+.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Dispatch ``url`` to the handler matching its URL type.

        Embed URLs are rewritten to the ``www`` host and returned as a URL
        result; talk and watch URLs go to ``_talk_info`` / ``_watch_info``;
        anything else that matched ``_VALID_URL`` is treated as a playlist.
        '''
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Returns the videos of the playlist

        url -- playlist page URL.
        name -- name taken from the URL, passed to ``_download_webpage``.

        Each talk in the page's info JSON becomes a URL result pointing at
        ``http://www.ted.com/talks/<slug>``.
        '''
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        '''Extract the info dict for a single talk page.

        url -- talk page URL.
        video_name -- name taken from the URL, used for progress reporting.

        Returns a dict with 'id', 'title', 'uploader', 'thumbnail',
        'description', 'subtitles' and 'formats', or None when the
        'listsubtitles' downloader parameter is set (the available subtitles
        are listed instead).
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # A missing or null 'nativeDownloads' is treated like an empty
        # mapping so that the rtmp fallback below is still reached.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the format ids we know about.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        # extract_subtitles/_list_available_subtitles are not defined in this
        # class; presumably they come from SubtitlesInfoExtractor and call
        # back into _get_available_subtitles -- verify against the base class.
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional.  Scheme-less values get an http scheme;
        # protocol-relative ones ("//host/...") only need the "http:" part.
        thumbnail = talk_info.get('thumb') or None
        if thumbnail:
            if thumbnail.startswith('//'):
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info.get('speaker'),
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        '''Map each subtitle language code of a talk to its SRT URL.

        video_id -- talk id, interpolated into the subtitle URL.
        talk_info -- talk dict; its 'languages' entries supply 'languageCode'.

        Returns an empty dict, after reporting a warning, when the talk lists
        no languages.
        '''
        languages = [
            lang['languageCode']
            for lang in talk_info.get('languages') or []]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        # dict() over a generator rather than a dict comprehension, to avoid
        # requiring syntax newer than the rest of this file uses.
        return dict(
            (code, 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, code))
            for code in languages)

    def _watch_info(self, url, name):
        '''Extract the info dict for a "watch" page.

        url -- watch page URL.
        name -- name taken from the URL; also used as the video id.

        The video URL and optional thumbnail come from the JSON stored in the
        page's ``data-config`` attribute; title and description are scraped
        from the HTML.  A missing description is not fatal.
        '''
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a9a3876d	6	from .subtitles import SubtitlesInfoExtractor
9fd5ce0c	7
4ed3e510	8	from ..utils import (
ca1fee34	9	compat_str,
4ed3e510 IM	10	)
4ed3e510 IM	11
f853f859	12
a9a3876d	13	class TEDIE(SubtitlesInfoExtractor):
aab74fa1 PH	14	_VALID_URL = r'''(?x)
	15	(?P<proto>https?://)
	16	(?P<type>www|embed)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
4d2f143c	30	'md5': 'fc94ac279feebbce69f21c0c6ee82810',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173 JMF	35	'description': ('Philosopher Dan Dennett makes a compelling '
	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
6f5ac90c	41	}
ac6c1048 PH	42	}, {
	43	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	44	'md5': '226f4fb9c62380d11b7995efa4c87994',
	45	'info_dict': {
	46	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	47	'ext': 'mp4',
	48	'title': 'Vishal Sikka: The beauty and power of algorithms',
	49	'thumbnail': 're:^https?://.+\.jpg',
	50	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	51	}
2d4c98db JMF	52	}, {
2d4c98db JMF	53	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
5bec5748	54	'md5': '49144e345a899b8cb34d315f3b9cfeeb',
2d4c98db JMF	55	'info_dict': {
2d4c98db JMF	56	'id': '1972',
5bec5748	57	'ext': 'mp4',
2d4c98db JMF	58	'title': 'Be passionate. Be courageous. Be your best.',
2d4c98db JMF	59	'uploader': 'Gabby Giffords and Mark Kelly',
5bec5748	60	'description': 'md5:5174aed4d0f16021b704120360f72b92',
2d4c98db	61	},
ac6c1048	62	}]
9fd5ce0c	63
0ba77818 PH	64	_NATIVE_FORMATS = {
	65	'low': {'preference': 1, 'width': 320, 'height': 180},
	66	'medium': {'preference': 2, 'width': 512, 'height': 288},
	67	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	68	}
9fd5ce0c	69
ca1fee34	70	def _extract_info(self, webpage):
bacac173 JMF	71	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
bacac173 JMF	72	webpage, 'info json')
ca1fee34 JMF	73	return json.loads(info_json)
ca1fee34 JMF	74
9fd5ce0c	75	def _real_extract(self, url):
bacac173	76	m = re.match(self._VALID_URL, url, re.VERBOSE)
aab74fa1 PH	77	if m.group('type') == 'embed':
	78	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
	79	return self.url_result(desktop_url, 'TED')
bacac173	80	name = m.group('name')
9fd5ce0c	81	if m.group('type_talk'):
bacac173	82	return self._talk_info(url, name)
ac6c1048 PH	83	elif m.group('type_watch'):
ac6c1048 PH	84	return self._watch_info(url, name)
bacac173	85	else:
ca1fee34	86	return self._playlist_videos_info(url, name)
9fd5ce0c	87
ca1fee34	88	def _playlist_videos_info(self, url, name):
9fd5ce0c	89	'''Returns the videos of the playlist'''
fc2ef392	90
ca1fee34 JMF	91	webpage = self._download_webpage(url, name,
	92	'Downloading playlist webpage')
	93	info = self._extract_info(webpage)
	94	playlist_info = info['playlist']
9fd5ce0c	95
fc2ef392	96	playlist_entries = [
f07a9f6f	97	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	98	for talk in info['talks']
fc2ef392 PH	99	]
fc2ef392 PH	100	return self.playlist_result(
ca1fee34 JMF	101	playlist_entries,
	102	playlist_id=compat_str(playlist_info['id']),
	103	playlist_title=playlist_info['title'])
9fd5ce0c	104
bacac173 JMF	105	def _talk_info(self, url, video_name):
bacac173 JMF	106	webpage = self._download_webpage(url, video_name)
9fd5ce0c	107	self.report_extraction(video_name)
a9a3876d	108
ca1fee34	109	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	110
652bee05	111	formats = [{
652bee05 JMF	112	'url': format_url,
	113	'format_id': format_id,
	114	'format': format_id,
2d4c98db JMF	115	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	116	if formats:
	117	for f in formats:
	118	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	119	if finfo:
	120	f.update(finfo)
	121	else:
	122	# Use rtmp downloads
	123	formats = [{
	124	'format_id': f['name'],
	125	'url': talk_info['streamer'],
	126	'play_path': f['file'],
	127	'ext': 'flv',
	128	'width': f['width'],
	129	'height': f['height'],
	130	'tbr': f['bitrate'],
	131	} for f in talk_info['resources']['rtmp']]
652bee05 JMF	132	self._sort_formats(formats)
652bee05 JMF	133
7b9965ea	134	video_id = compat_str(talk_info['id'])
a9a3876d	135	# subtitles
652bee05	136	video_subtitles = self.extract_subtitles(video_id, talk_info)
a9a3876d	137	if self._downloader.params.get('listsubtitles', False):
652bee05	138	self._list_available_subtitles(video_id, talk_info)
a9a3876d IM	139	return
a9a3876d IM	140
b6c1cecc JMF	141	thumbnail = talk_info['thumb']
	142	if not thumbnail.startswith('http'):
	143	thumbnail = 'http://' + thumbnail
463a9087	144	return {
a9a3876d	145	'id': video_id,
652bee05 JMF	146	'title': talk_info['title'],
652bee05 JMF	147	'uploader': talk_info['speaker'],
b6c1cecc	148	'thumbnail': thumbnail,
652bee05	149	'description': self._og_search_description(webpage),
a9a3876d	150	'subtitles': video_subtitles,
0d8cb1cc PH	151	'formats': formats,
	152	}
	153
652bee05 JMF	154	def _get_available_subtitles(self, video_id, talk_info):
	155	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	156	if languages:
	157	sub_lang_list = {}
	158	for l in languages:
	159	url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
	160	sub_lang_list[l] = url
	161	return sub_lang_list
	162	else:
f07a9f6f	163	self._downloader.report_warning('video doesn\'t have subtitles')
652bee05	164	return {}
ac6c1048 PH	165
	166	def _watch_info(self, url, name):
	167	webpage = self._download_webpage(url, name)
	168
	169	config_json = self._html_search_regex(
	170	r"data-config='([^']+)", webpage, 'config')
	171	config = json.loads(config_json)
	172	video_url = config['video']['url']
	173	thumbnail = config.get('image', {}).get('url')
	174
	175	title = self._html_search_regex(
	176	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	177	description = self._html_search_regex(
621f33c9 PH	178	[
	179	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
	180	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	181	],
ac6c1048 PH	182	webpage, 'description', fatal=False)
	183
	184	return {
	185	'id': name,
	186	'url': video_url,
	187	'title': title,
	188	'thumbnail': thumbnail,
	189	'description': description,
	190	}