[yt-dlp.git] / youtube_dl / extractor / ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    # Verbose-mode pattern for the URLs this extractor accepts.  The named
    # groups are consumed by _real_extract:
    #   proto / urlmain -- glued back together around 'www' to turn an embed
    #                      URL into the corresponding www URL
    #   type            -- host prefix, either 'www' or 'embed'
    #   type_talk       -- set when the path starts with 'talks'
    #   type_watch      -- set when the path starts with 'watch/<x>/<y>'
    #   type_playlist   -- set when the path starts with 'playlists[/<n>]'
    #   name            -- last captured path component, used as display name
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': 're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'flv',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:d89e1d8ebafdac8e55df4c219ecdbfe9',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]

    # Extra metadata for the direct-download qualities, keyed by the format id
    # found in a talk's "nativeDownloads" mapping.  _talk_info merges the
    # matching entry into each format dict before the formats are sorted.
    # 'preference' grows with the resolution (presumably higher is preferred
    # by the sorter -- confirm against _sort_formats).
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
            webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type') == 'embed':
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Return a playlist result with one entry per talk of the playlist.

        Each entry is a url_result pointing at the talk's own page (built
        from its slug), so the talks are extracted individually later.
        '''
        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        page_info = self._extract_info(page)
        playlist_meta = page_info['playlist']

        entries = []
        for talk in page_info['talks']:
            entries.append(self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=compat_str(playlist_meta['id']),
            playlist_title=playlist_meta['title'])

    def _talk_info(self, url, video_name):
        '''Extract a single talk from the page at *url*.

        *video_name* is only used as the display id while downloading and
        reporting.  Direct downloads listed under "nativeDownloads" are
        preferred; when there are none, the rtmp resources are used.

        Returns the info dict of the talk, or None when the 'listsubtitles'
        downloader param is set (the available subtitles are listed instead).
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        # Individual download URLs can be null (skipped below).  Also accept
        # the whole mapping being missing or null, so that such a talk falls
        # through to the rtmp branch instead of raising here.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference/width/height for the qualities we know about.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional metadata: a missing one must not abort
        # the extraction.
        thumbnail = talk_info.get('thumb') or None
        if thumbnail is not None:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
                sub_lang_list[l] = url
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

    def _watch_info(self, url, name):
        '''Extract a video from a "watch" page.

        The media URL and the optional thumbnail come from the JSON stored
        in the page's data-config attribute; title and description are
        scraped from the HTML.  *name* (taken from the URL) is used as id.
        '''
        page = self._download_webpage(url, name)

        player_config = json.loads(self._html_search_regex(
            r"data-config='([^']+)", page, 'config'))
        media_url = player_config['video']['url']
        image_url = player_config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", page, 'title')

        # Two known page layouts; a missing description is not fatal.
        description_patterns = [
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
        ]
        description = self._html_search_regex(
            description_patterns, page, 'description', fatal=False)

        return {
            'id': name,
            'url': media_url,
            'title': title,
            'thumbnail': image_url,
            'description': description,
        }
Commit	Line	Data
f853f859 PH	1	from __future__ import unicode_literals
f853f859 PH	2
9fd5ce0c PH	3	import json
	4	import re
	5
a9a3876d	6	from .subtitles import SubtitlesInfoExtractor
9fd5ce0c	7
4ed3e510	8	from ..utils import (
ca1fee34	9	compat_str,
4ed3e510 IM	10	)
4ed3e510 IM	11
f853f859	12
a9a3876d	13	class TEDIE(SubtitlesInfoExtractor):
aab74fa1 PH	14	_VALID_URL = r'''(?x)
	15	(?P<proto>https?://)
	16	(?P<type>www\|embed)(?P<urlmain>\.ted\.com/
bacac173 JMF	17	(
	18	(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
	19	\|
	20	((?P<type_talk>talks)) # We have a simple talk
ac6c1048 PH	21	\|
ac6c1048 PH	22	(?P<type_watch>watch)/[^/]+/[^/]+
bacac173 JMF	23	)
bacac173 JMF	24	(/lang/(.*?))? # The url may contain the language
ac6c1048	25	/(?P<name>[\w-]+) # Here goes the name and then ".html"
aab74fa1	26	.*)$
bacac173	27	'''
ac6c1048	28	_TESTS = [{
f853f859	29	'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
26dca166	30	'md5': '4ea1dada91e4174b53dac2bb8ace429d',
f853f859	31	'info_dict': {
7b9965ea JMF	32	'id': '102',
7b9965ea JMF	33	'ext': 'mp4',
652bee05	34	'title': 'The illusion of consciousness',
bacac173 JMF	35	'description': ('Philosopher Dan Dennett makes a compelling '
	36	'argument that not only don\'t we understand our own '
	37	'consciousness, but that half the time our brains are '
	38	'actively fooling us.'),
652bee05	39	'uploader': 'Dan Dennett',
0ba77818	40	'width': 854,
6f5ac90c	41	}
ac6c1048 PH	42	}, {
	43	'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
	44	'md5': '226f4fb9c62380d11b7995efa4c87994',
	45	'info_dict': {
	46	'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
	47	'ext': 'mp4',
	48	'title': 'Vishal Sikka: The beauty and power of algorithms',
	49	'thumbnail': 're:^https?://.+\.jpg',
	50	'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
	51	}
2d4c98db JMF	52	}, {
	53	'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
	54	'info_dict': {
	55	'id': '1972',
	56	'ext': 'flv',
	57	'title': 'Be passionate. Be courageous. Be your best.',
	58	'uploader': 'Gabby Giffords and Mark Kelly',
	59	'description': 'md5:d89e1d8ebafdac8e55df4c219ecdbfe9',
	60	},
	61	'params': {
	62	# rtmp download
	63	'skip_download': True,
	64	},
ac6c1048	65	}]
9fd5ce0c	66
0ba77818 PH	67	_NATIVE_FORMATS = {
	68	'low': {'preference': 1, 'width': 320, 'height': 180},
	69	'medium': {'preference': 2, 'width': 512, 'height': 288},
	70	'high': {'preference': 3, 'width': 854, 'height': 480},
652bee05	71	}
9fd5ce0c	72
ca1fee34	73	def _extract_info(self, webpage):
bacac173 JMF	74	info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
bacac173 JMF	75	webpage, 'info json')
ca1fee34 JMF	76	return json.loads(info_json)
ca1fee34 JMF	77
9fd5ce0c	78	def _real_extract(self, url):
bacac173	79	m = re.match(self._VALID_URL, url, re.VERBOSE)
aab74fa1 PH	80	if m.group('type') == 'embed':
	81	desktop_url = m.group('proto') + 'www' + m.group('urlmain')
	82	return self.url_result(desktop_url, 'TED')
bacac173	83	name = m.group('name')
9fd5ce0c	84	if m.group('type_talk'):
bacac173	85	return self._talk_info(url, name)
ac6c1048 PH	86	elif m.group('type_watch'):
ac6c1048 PH	87	return self._watch_info(url, name)
bacac173	88	else:
ca1fee34	89	return self._playlist_videos_info(url, name)
9fd5ce0c	90
ca1fee34	91	def _playlist_videos_info(self, url, name):
9fd5ce0c	92	'''Returns the videos of the playlist'''
fc2ef392	93
ca1fee34 JMF	94	webpage = self._download_webpage(url, name,
	95	'Downloading playlist webpage')
	96	info = self._extract_info(webpage)
	97	playlist_info = info['playlist']
9fd5ce0c	98
fc2ef392	99	playlist_entries = [
f07a9f6f	100	self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
ca1fee34	101	for talk in info['talks']
fc2ef392 PH	102	]
fc2ef392 PH	103	return self.playlist_result(
ca1fee34 JMF	104	playlist_entries,
	105	playlist_id=compat_str(playlist_info['id']),
	106	playlist_title=playlist_info['title'])
9fd5ce0c	107
bacac173 JMF	108	def _talk_info(self, url, video_name):
bacac173 JMF	109	webpage = self._download_webpage(url, video_name)
9fd5ce0c	110	self.report_extraction(video_name)
a9a3876d	111
ca1fee34	112	talk_info = self._extract_info(webpage)['talks'][0]
a9a3876d	113
652bee05	114	formats = [{
652bee05 JMF	115	'url': format_url,
	116	'format_id': format_id,
	117	'format': format_id,
2d4c98db JMF	118	} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
	119	if formats:
	120	for f in formats:
	121	finfo = self._NATIVE_FORMATS.get(f['format_id'])
	122	if finfo:
	123	f.update(finfo)
	124	else:
	125	# Use rtmp downloads
	126	formats = [{
	127	'format_id': f['name'],
	128	'url': talk_info['streamer'],
	129	'play_path': f['file'],
	130	'ext': 'flv',
	131	'width': f['width'],
	132	'height': f['height'],
	133	'tbr': f['bitrate'],
	134	} for f in talk_info['resources']['rtmp']]
652bee05 JMF	135	self._sort_formats(formats)
652bee05 JMF	136
7b9965ea	137	video_id = compat_str(talk_info['id'])
a9a3876d	138	# subtitles
652bee05	139	video_subtitles = self.extract_subtitles(video_id, talk_info)
a9a3876d	140	if self._downloader.params.get('listsubtitles', False):
652bee05	141	self._list_available_subtitles(video_id, talk_info)
a9a3876d IM	142	return
a9a3876d IM	143
b6c1cecc JMF	144	thumbnail = talk_info['thumb']
	145	if not thumbnail.startswith('http'):
	146	thumbnail = 'http://' + thumbnail
463a9087	147	return {
a9a3876d	148	'id': video_id,
652bee05 JMF	149	'title': talk_info['title'],
652bee05 JMF	150	'uploader': talk_info['speaker'],
b6c1cecc	151	'thumbnail': thumbnail,
652bee05	152	'description': self._og_search_description(webpage),
a9a3876d	153	'subtitles': video_subtitles,
0d8cb1cc PH	154	'formats': formats,
	155	}
	156
652bee05 JMF	157	def _get_available_subtitles(self, video_id, talk_info):
	158	languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
	159	if languages:
	160	sub_lang_list = {}
	161	for l in languages:
	162	url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
	163	sub_lang_list[l] = url
	164	return sub_lang_list
	165	else:
f07a9f6f	166	self._downloader.report_warning('video doesn\'t have subtitles')
652bee05	167	return {}
ac6c1048 PH	168
	169	def _watch_info(self, url, name):
	170	webpage = self._download_webpage(url, name)
	171
	172	config_json = self._html_search_regex(
	173	r"data-config='([^']+)", webpage, 'config')
	174	config = json.loads(config_json)
	175	video_url = config['video']['url']
	176	thumbnail = config.get('image', {}).get('url')
	177
	178	title = self._html_search_regex(
	179	r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
	180	description = self._html_search_regex(
621f33c9 PH	181	[
	182	r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.?</h4>(.?)</div>',
	183	r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
	184	],
ac6c1048 PH	185	webpage, 'description', fatal=False)
	186
	187	return {
	188	'id': name,
	189	'url': video_url,
	190	'title': title,
	191	'thumbnail': thumbnail,
	192	'description': description,
	193	}