]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/ted.py
4 from .common
import InfoExtractor
class TedBaseIE(InfoExtractor):
    """Shared base for the www.ted.com extractors.

    Provides the URL template that subclasses format their ``_VALID_URL``
    from, and a helper that turns a playlist-like mapping into entries.
    """

    # ``{type}`` is filled in by subclasses via ``str.format``. The optional
    # ``/lang/<code>`` path segment is matched but not captured, and the
    # following path segment is captured as the ``id`` group.
    _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'

    def _parse_playlist(self, playlist):
        """Yield a ``url_result`` for each video node found in *playlist*.

        Only nodes whose ``__typename`` equals ``'Video'`` and that have a
        truthy ``canonicalUrl`` are yielded; every result is tagged with
        ``TedTalkIE.ie_key()``. Nothing is yielded when
        ``playlist['videos']['nodes']`` is missing or is not a list.
        """
        # try_get returns None when the lookup fails or the value is not a
        # list; iterating None would raise TypeError, so fall back to [].
        nodes = try_get(playlist, lambda x: x['videos']['nodes'], list) or []
        for entry in nodes:
            if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
                yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
25 class TedTalkIE(TedBaseIE
):
26 _VALID_URL
= TedBaseIE
._VALID
_URL
_BASE
.format(type='talks')
28 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
29 'md5': '47e82c666d9c3261d4fe74748a90aada',
33 'title': 'How to break down barriers and not accept limits',
34 'description': 'md5:000707cece219d1e165b11550d612331',
36 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
37 'uploader': 'Candace Parker',
39 'upload_date': '20220114',
40 'release_date': '20211201',
41 'thumbnail': r
're:http.*\.jpg',
45 def _real_extract(self
, url
):
46 display_id
= self
._match
_id
(url
)
47 webpage
= self
._download
_webpage
(url
, display_id
)
48 talk_info
= self
._search
_nextjs
_data
(webpage
, display_id
)['props']['pageProps']['videoData']
49 video_id
= talk_info
['id']
50 playerData
= self
._parse
_json
(talk_info
.get('playerData'), video_id
)
53 formats
, subtitles
= [], {}
54 for format_id
, resources
in (playerData
.get('resources') or {}).items():
55 if format_id
== 'hls':
56 stream_url
= url_or_none(try_get(resources
, lambda x
: x
['stream']))
59 m3u8_formats
, m3u8_subs
= self
._extract
_m
3u8_formats
_and
_subtitles
(
60 stream_url
, video_id
, 'mp4', m3u8_id
=format_id
, fatal
=False)
61 formats
.extend(m3u8_formats
)
62 subtitles
= self
._merge
_subtitles
(subtitles
, m3u8_subs
)
65 if not isinstance(resources
, list):
67 if format_id
== 'h264':
68 for resource
in resources
:
69 h264_url
= resource
.get('file')
72 bitrate
= int_or_none(resource
.get('bitrate'))
75 'format_id': '%s-%sk' % (format_id
, bitrate
),
78 if re
.search(r
'\d+k', h264_url
):
80 elif format_id
== 'rtmp':
81 streamer
= talk_info
.get('streamer')
85 'format_id': '%s-%s' % (format_id
, resource
.get('name')),
87 'play_path': resource
['file'],
89 'width': int_or_none(resource
.get('width')),
90 'height': int_or_none(resource
.get('height')),
91 'tbr': int_or_none(resource
.get('bitrate')),
92 } for resource
in resources
if resource
.get('file'))
95 m3u8_formats
= [f
for f
in formats
if f
.get('protocol') == 'm3u8' and f
.get('vcodec') != 'none']
96 for m3u8_format
in m3u8_formats
:
97 bitrate
= self
._search
_regex
(r
'(\d+k)', m3u8_format
['url'], 'bitrate', default
=None)
100 bitrate_url
= re
.sub(r
'\d+k', bitrate
, http_url
)
101 if not self
._is
_valid
_url
(
102 bitrate_url
, video_id
, '%s bitrate' % bitrate
):
104 f
= m3u8_format
.copy()
107 'format_id': m3u8_format
['format_id'].replace('hls', 'http'),
110 if f
.get('acodec') == 'none':
114 audio_download
= talk_info
.get('audioDownload')
117 'url': audio_download
,
118 'format_id': 'audio',
123 external
= playerData
.get('external') or {}
124 service
= external
.get('service') or ''
125 ext_url
= external
.get('code') if service
.lower() == 'youtube' else None
126 return self
.url_result(ext_url
or external
['uri'])
128 self
._sort
_formats
(formats
)
130 thumbnail
= playerData
.get('thumb') or self
._og
_search
_property
('image', webpage
)
132 # trim thumbnail resize parameters
133 thumbnail
= thumbnail
.split('?')[0]
137 'title': talk_info
.get('title') or self
._og
_search
_title
(webpage
),
138 'uploader': talk_info
.get('presenterDisplayName'),
139 'thumbnail': thumbnail
,
140 'description': talk_info
.get('description') or self
._og
_search
_description
(webpage
),
141 'subtitles': subtitles
,
143 'duration': talk_info
.get('duration') or parse_duration(self
._og
_search
_property
('video:duration', webpage
)),
144 'view_count': str_to_int(talk_info
.get('viewedCount')),
145 'upload_date': unified_strdate(talk_info
.get('publishedAt')),
146 'release_date': unified_strdate(talk_info
.get('recordedOn')),
147 'tags': try_get(playerData
, lambda x
: x
['targeting']['tag'].split(',')),
151 class TedSeriesIE(TedBaseIE
):
152 _VALID_URL
= fr
'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?'
154 'url': 'https://www.ted.com/series/small_thing_big_idea',
157 'title': 'Small Thing Big Idea',
158 'series': 'Small Thing Big Idea',
159 'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
161 'playlist_mincount': 16,
163 'url': 'https://www.ted.com/series/the_way_we_work#season_2',
166 'title': 'The Way We Work Season 2',
167 'series': 'The Way We Work',
168 'description': 'md5:59469256e533e1a48c4aa926a382234c',
171 'playlist_mincount': 8,
174 def _real_extract(self
, url
):
175 display_id
, season
= self
._match
_valid
_url
(url
).group('id', 'season')
176 webpage
= self
._download
_webpage
(url
, display_id
, 'Downloading series webpage')
177 info
= self
._search
_nextjs
_data
(webpage
, display_id
)['props']['pageProps']
179 entries
= itertools
.chain
.from_iterable(
180 self
._parse
_playlist
(s
) for s
in info
['seasons'] if season
in [None, s
.get('seasonNumber')])
182 series_id
= try_get(info
, lambda x
: x
['series']['id'])
183 series_name
= try_get(info
, lambda x
: x
['series']['name']) or self
._og
_search
_title
(webpage
, fatal
=False)
185 return self
.playlist_result(
187 f
'{series_id}_{season}' if season
and series_id
else series_id
,
188 f
'{series_name} Season {season}' if season
else series_name
,
189 self
._og
_search
_description
(webpage
),
190 series
=series_name
, season_number
=int_or_none(season
))
193 class TedPlaylistIE(TedBaseIE
):
194 _VALID_URL
= TedBaseIE
._VALID
_URL
_BASE
.format(type=r
'playlists(?:/\d+)?')
196 'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
199 'title': 'The most popular talks of all time',
200 'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
202 'playlist_mincount': 25,
def _real_extract(self, url):
    """Return a playlist result built from a TED playlist page.

    The playlist mapping is read from the page's Next.js data under
    ``props.pageProps.playlist``; its video nodes are turned into entries
    by ``_parse_playlist``.
    """
    playlist_slug = self._match_id(url)
    page = self._download_webpage(url, playlist_slug)
    nextjs_data = self._search_nextjs_data(page, playlist_slug)
    playlist = nextjs_data['props']['pageProps']['playlist']

    title = playlist.get('title')
    if not title:
        # Fall back to og:title with the site suffix removed; an empty
        # string collapses to None.
        title = self._og_search_title(page, default='').replace(' | TED Talks', '') or None

    return self.playlist_result(
        self._parse_playlist(playlist),
        playlist.get('id'),
        title,
        self._og_search_description(page))
216 class TedEmbedIE(InfoExtractor
):
217 _VALID_URL
= r
'https?://embed(?:-ssl)?\.ted\.com/'
218 _EMBED_REGEX
= [rf
'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1']
221 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
225 'title': 'How to get serious about diversity and inclusion in the workplace',
226 'description': 'md5:0978aafe396e05341f8ecc795d22189d',
229 'uploader': 'Janet Stovall',
231 'upload_date': '20180822',
232 'release_date': '20180719',
233 'thumbnail': r
're:http.*\.jpg',
def _real_extract(self, url):
    """Hand an embed URL over to ``TedTalkIE`` as a www.ted.com URL."""
    # Replace the "embed" / "embed-ssl" host label with "www".
    www_url = re.sub(r'://embed(-ssl)?', '://www', url)
    return self.url_result(www_url, TedTalkIE.ie_key())