[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    """Extractor for ted.com talk, playlist and "watch" pages.

    ``_VALID_URL`` recognises three URL shapes (``talks``, ``playlists`` and
    ``watch``) on the ``www`` and ``embed`` hosts.  ``_real_extract`` looks
    at which named group matched and dispatches to the matching helper;
    ``embed`` URLs are first rewritten to their ``www`` form.
    """

    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: the value contains a regex escape ("\.") that is
            # not a valid Python string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }]

    # Extra fields merged into a talk's format dict when its format id (a key
    # of the page's "nativeDownloads" mapping) is one of these names.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        """Return the JSON object embedded in *webpage*, decoded.

        The object is the second argument of a ``q("<name>.init", {...})``
        call that is immediately followed by ``</script>``.
        """
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk, watch or playlist handler.

        ``embed`` URLs are not extracted here: they are rewritten to the
        ``www`` host and handed back through ``url_result``.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')

        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        if m.group('type_watch'):
            return self._watch_info(url, name)
        # Neither a talk nor a watch page: the pattern leaves only playlists.
        return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return the videos of the playlist.

        Each entry of the page's ``talks`` list becomes a ``url_result``
        pointing at ``http://www.ted.com/talks/<slug>``; the playlist id and
        title come from the page's ``playlist`` object.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_name):
        """Build the info dict for a single talk page.

        Returns ``None`` (after calling ``_list_available_subtitles``) when
        the ``listsubtitles`` downloader parameter is set; otherwise returns
        a dict with the id, title, uploader, thumbnail, description,
        subtitles and formats of the first entry of the page's ``talks``
        list.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        formats = []
        for format_id, format_url in talk_info['nativeDownloads'].items():
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Known format ids get their preference/width/height added.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        thumbnail = talk_info['thumb']
        if thumbnail.startswith('//'):
            # Protocol-relative value: it only lacks the scheme.  Prefixing
            # 'http://' here would produce 'http:////host/...'.
            thumbnail = 'http:' + thumbnail
        elif not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each language code of the talk to its SRT subtitle URL.

        Language codes are read from ``talk_info['languages']``.  When that
        list is missing or empty a warning is reported and an empty dict is
        returned.
        """
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict() over a generator rather than a dict comprehension, so the
        # module uses no syntax newer than the rest of the file.
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
             % (video_id, lang_code))
            for lang_code in languages)

    def _watch_info(self, url, name):
        """Build the info dict for a ``/watch/...`` page.

        The video URL and optional thumbnail come from the JSON stored in the
        page's ``data-config`` attribute; the title and description are
        scraped from the HTML.  A missing description is not fatal.
        """
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r"data-config='([^']+)", webpage, 'config')
        config = json.loads(config_json)
        video_url = config['video']['url']
        # 'image' is optional in the config, so thumbnail may be None.
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a9a3876d	6	from .subtitles import SubtitlesInfoExtractor
9fd5ce0c	7
4ed3e510	8	from ..utils import (
ca1fee34	9	compat_str,
4ed3e510 IM	10	)
4ed3e510 IM	11
f853f859	12
a9a3876d	13	class TEDIE(SubtitlesInfoExtractor):
aab74fa1 PH	14	_VALID_URL = r'''(?x)
	15	(?P<proto>https?://)
	16	(?P<type>www\|embed)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	\|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	\|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
26dca166	30	'md5': '4ea1dada91e4174b53dac2bb8ace429d',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173 JMF	35	'description': ('Philosopher Dan Dennett makes a compelling '
	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
6f5ac90c	41	}
ac6c1048 PH	42	}, {
	43	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	44	'md5': '226f4fb9c62380d11b7995efa4c87994',
	45	'info_dict': {
	46	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	47	'ext': 'mp4',
	48	'title': 'Vishal Sikka: The beauty and power of algorithms',
	49	'thumbnail': 're:^https?://.+\.jpg',
	50	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	51	}
	52	}]
9fd5ce0c	53
0ba77818 PH	54	_NATIVE_FORMATS = {
	55	'low': {'preference': 1, 'width': 320, 'height': 180},
	56	'medium': {'preference': 2, 'width': 512, 'height': 288},
	57	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	58	}
9fd5ce0c	59
ca1fee34	60	def _extract_info(self, webpage):
bacac173 JMF	61	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
bacac173 JMF	62	webpage, 'info json')
ca1fee34 JMF	63	return json.loads(info_json)
ca1fee34 JMF	64
9fd5ce0c	65	def _real_extract(self, url):
bacac173	66	m = re.match(self._VALID_URL, url, re.VERBOSE)
aab74fa1 PH	67	if m.group('type') == 'embed':
	68	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
	69	return self.url_result(desktop_url, 'TED')
bacac173	70	name = m.group('name')
9fd5ce0c	71	if m.group('type_talk'):
bacac173	72	return self._talk_info(url, name)
ac6c1048 PH	73	elif m.group('type_watch'):
ac6c1048 PH	74	return self._watch_info(url, name)
bacac173	75	else:
ca1fee34	76	return self._playlist_videos_info(url, name)
9fd5ce0c	77
ca1fee34	78	def _playlist_videos_info(self, url, name):
9fd5ce0c	79	'''Returns the videos of the playlist'''
fc2ef392	80
ca1fee34 JMF	81	webpage = self._download_webpage(url, name,
	82	'Downloading playlist webpage')
	83	info = self._extract_info(webpage)
	84	playlist_info = info['playlist']
9fd5ce0c	85
fc2ef392	86	playlist_entries = [
ca1fee34 JMF	87	self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34 JMF	88	for talk in info['talks']
fc2ef392 PH	89	]
fc2ef392 PH	90	return self.playlist_result(
ca1fee34 JMF	91	playlist_entries,
	92	playlist_id=compat_str(playlist_info['id']),
	93	playlist_title=playlist_info['title'])
9fd5ce0c	94
bacac173 JMF	95	def _talk_info(self, url, video_name):
bacac173 JMF	96	webpage = self._download_webpage(url, video_name)
9fd5ce0c	97	self.report_extraction(video_name)
a9a3876d	98
ca1fee34	99	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	100
652bee05	101	formats = [{
652bee05 JMF	102	'url': format_url,
	103	'format_id': format_id,
	104	'format': format_id,
652bee05	105	} for (format_id, format_url) in talk_info['nativeDownloads'].items()]
0ba77818 PH	106	for f in formats:
	107	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	108	if finfo:
	109	f.update(finfo)
652bee05 JMF	110	self._sort_formats(formats)
652bee05 JMF	111
7b9965ea	112	video_id = compat_str(talk_info['id'])
a9a3876d	113	# subtitles
652bee05	114	video_subtitles = self.extract_subtitles(video_id, talk_info)
a9a3876d	115	if self._downloader.params.get('listsubtitles', False):
652bee05	116	self._list_available_subtitles(video_id, talk_info)
a9a3876d IM	117	return
a9a3876d IM	118
b6c1cecc JMF	119	thumbnail = talk_info['thumb']
	120	if not thumbnail.startswith('http'):
	121	thumbnail = 'http://' + thumbnail
463a9087	122	return {
a9a3876d	123	'id': video_id,
652bee05 JMF	124	'title': talk_info['title'],
652bee05 JMF	125	'uploader': talk_info['speaker'],
b6c1cecc	126	'thumbnail': thumbnail,
652bee05	127	'description': self._og_search_description(webpage),
a9a3876d	128	'subtitles': video_subtitles,
0d8cb1cc PH	129	'formats': formats,
	130	}
	131
652bee05 JMF	132	def _get_available_subtitles(self, video_id, talk_info):
	133	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	134	if languages:
	135	sub_lang_list = {}
	136	for l in languages:
	137	url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
	138	sub_lang_list[l] = url
	139	return sub_lang_list
	140	else:
4ed3e510	141	self._downloader.report_warning(u'video doesn\'t have subtitles')
652bee05	142	return {}
ac6c1048 PH	143
	144	def _watch_info(self, url, name):
	145	webpage = self._download_webpage(url, name)
	146
	147	config_json = self._html_search_regex(
	148	r"data-config='([^']+)", webpage, 'config')
	149	config = json.loads(config_json)
	150	video_url = config['video']['url']
	151	thumbnail = config.get('image', {}).get('url')
	152
	153	title = self._html_search_regex(
	154	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	155	description = self._html_search_regex(
	156	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.?</h4>(.?)</div>',
	157	webpage, 'description', fatal=False)
	158
	159	return {
	160	'id': name,
	161	'url': video_url,
	162	'title': title,
	163	'thumbnail': thumbnail,
	164	'description': description,
	165	}