]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nba.py
[utils] Add `parse_qs`
[yt-dlp.git] / yt_dlp / extractor / nba.py
from __future__ import unicode_literals

import functools
import re

from .turner import TurnerBaseIE
from ..compat import (
    compat_str,
    compat_urllib_parse_unquote,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    merge_dicts,
    OnDemandPagedList,
    parse_duration,
    parse_iso8601,
    parse_qs,
    try_get,
    update_url_query,
    urljoin,
)
22
23
class NBACVPBaseIE(TurnerBaseIE):
    """Common base for NBA extractors that read CVP XML from secure.nba.com."""

    def _extract_nba_cvp_info(self, path, video_id, fatal=False):
        """Delegate to ``_extract_cvp_info`` for a path on secure.nba.com.

        ``path`` is appended to ``http://secure.nba.com/``; ``fatal`` is
        forwarded unchanged to ``_extract_cvp_info``.
        """
        cvp_url = 'http://secure.nba.com/%s' % path
        # Per-key ``media_src`` values handed to the inherited CVP helper.
        media_sources = {
            'default': {
                'media_src': 'http://nba.cdn.turner.com/nba/big',
            },
            'm3u8': {
                'media_src': 'http://nbavod-f.akamaihd.net',
            },
        }
        return self._extract_cvp_info(
            cvp_url, video_id, media_sources, fatal=fatal)
35
36
class NBAWatchBaseIE(NBACVPBaseIE):
    """Base class for the NBA "watch" extractors.

    Holds the URL prefix shared by the subclasses and the common routine
    that turns a search hit into an info dict.
    """

    # Matches http(s)://[www.]nba.com[/watch]/ as well as http(s)://watch.nba.com/
    _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/'

    def _extract_video(self, filter_key, filter_value):
        """Look up a single video through the search endpoint and extract it.

        The search query is ``<filter_key>:<filter_value>`` and the first
        returned document is used.

        Returns an info dict including ``formats``.
        Raises ExtractorError when the search yields no documents.
        """
        docs = self._download_json(
            'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch',
            filter_value, query={
                'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName',
                'q': filter_key + ':' + filter_value,
                'wt': 'json',
            })['response']['docs']
        if not docs:
            # An empty result list would otherwise surface as a bare IndexError.
            raise ExtractorError(
                'No video found for %s:%s' % (filter_key, filter_value),
                expected=True)
        video = docs[0]

        # compat_str keeps this consistent with the rest of the file
        # (the module uses unicode_literals).
        video_id = compat_str(video['pid'])
        title = video['name']

        formats = []
        # The publishpoint request is best-effort (fatal=False); a failed
        # request simply yields no HLS formats.
        m3u8_url = (self._download_json(
            'https://watch.nba.com/service/publishpoint', video_id, query={
                'type': 'video',
                'format': 'json',
                'id': video_id,
            }, headers={
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
            }, fatal=False) or {}).get('path')
        if m3u8_url:
            # Strip the "_pc"/"_iphone" device suffix from the manifest URL.
            m3u8_formats = self._extract_m3u8_formats(
                re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False)
            formats.extend(m3u8_formats)
            # Mirror every HLS variant as a plain HTTP format by dropping
            # ".m3u8" from its URL.
            for f in m3u8_formats:
                http_f = f.copy()
                http_f.update({
                    'format_id': http_f['format_id'].replace('hls-', 'http-'),
                    'protocol': 'http',
                    'url': http_f['url'].replace('.m3u8', ''),
                })
                formats.append(http_f)

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
            'description': video.get('description'),
            'duration': int_or_none(video.get('runtime')),
            'timestamp': parse_iso8601(video.get('releaseDate')),
            'tags': video.get('tags'),
        }

        seo_name = video.get('seoName')
        # Only seoNames containing a yyyy/mm/dd/ path are tried against CVP.
        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
            base_path = ''
            if seo_name.startswith('teams/'):
                # Prefix with the second path segment of the seoName.
                base_path += seo_name.split('/')[1] + '/'
            base_path += 'video/'
            cvp_info = self._extract_nba_cvp_info(
                base_path + seo_name + '.xml', video_id, False)
            if cvp_info:
                formats.extend(cvp_info['formats'])
                info = merge_dicts(info, cvp_info)

        self._sort_formats(formats)
        info['formats'] = formats
        return info
100
101
class NBAWatchEmbedIE(NBAWatchBaseIE):
    """Extractor for watch embed URLs of the form ``.../embed?id=<digits>``."""

    # Was misspelled ``IENAME``, so the intended extractor name was never used.
    IE_NAME = 'nba:watch:embed'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://watch.nba.com/embed?id=659395',
        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
        'info_dict': {
            'id': '659395',
            'ext': 'mp4',
            'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'timestamp': 1492228800,
            'upload_date': '20170415',
        },
    }]

    def _real_extract(self, url):
        """Extract the video whose ``pid`` equals the ``id`` query value."""
        video_id = self._match_id(url)
        return self._extract_video('pid', video_id)
121
122
class NBAWatchIE(NBAWatchBaseIE):
    """Extractor for single "watch" video pages (``.../video/<seoName>``)."""

    IE_NAME = 'nba:watch'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
        'md5': '9d902940d2a127af3f7f9d2f3dc79c96',
        'info_dict': {
            'id': '70946',
            'ext': 'mp4',
            'title': 'Thunder vs. Nets',
            'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
            'duration': 181,
            'timestamp': 1354597200,
            'upload_date': '20121204',
        },
    }, {
        'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
        'only_matching': True,
    }, {
        'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
        'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
        'info_dict': {
            'id': '330865',
            'ext': 'mp4',
            'title': 'Hawks vs. Cavaliers Game 1',
            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
            'duration': 228,
            'timestamp': 1432094400,
            'upload_date': '20150521',
        },
    }, {
        'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115',
        'only_matching': True,
    }, {
        # only CVP mp4 format available
        'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106',
        'only_matching': True,
    }, {
        'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video, or hand off to the collection extractor.

        When the URL carries a ``collection`` query parameter and the
        ``noplaylist`` option is not set, a URL result pointing at the
        collection page is returned instead of the single video.
        """
        display_id = self._match_id(url)
        collection_id = parse_qs(url).get('collection', [None])[0]

        # No collection requested: plain single-video extraction.
        if not collection_id:
            return self._extract_video('seoName', display_id)

        # Collection present but the user asked for the video only.
        if self.get_param('noplaylist'):
            self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
            return self._extract_video('seoName', display_id)

        self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
        collection_url = 'https://www.nba.com/watch/list/collection/' + collection_id
        return self.url_result(
            collection_url, NBAWatchCollectionIE.ie_key(), collection_id)
177
178
class NBAWatchCollectionIE(NBAWatchBaseIE):
    """Extractor for "watch" collection pages (``.../list/collection/<id>``)."""

    IE_NAME = 'nba:watch:collection'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://watch.nba.com/list/collection/season-preview-2020',
        'info_dict': {
            'id': 'season-preview-2020',
        },
        'playlist_mincount': 43,
    }]
    _PAGE_SIZE = 100

    def _fetch_page(self, collection_id, page):
        """Yield URL entries for one page of the collection.

        ``page`` is the zero-based index supplied by the paged list; the
        API is queried with ``page + 1``. Videos that have neither a
        ``seoName`` nor a ``slug`` are skipped.
        """
        page_number = page + 1
        api_url = 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id
        response = self._download_json(
            api_url, collection_id,
            'Downloading page %d JSON metadata' % page_number,
            query={
                'count': self._PAGE_SIZE,
                'page': page_number,
            })
        for item in response['results']['videos']:
            program = item.get('program') or {}
            seo_name = program.get('seoName') or program.get('slug')
            if seo_name:
                entry = {
                    '_type': 'url',
                    'id': program.get('id'),
                    'title': program.get('title') or item.get('title'),
                    'url': 'https://www.nba.com/watch/video/' + seo_name,
                    'thumbnail': item.get('image'),
                    'description': program.get('description') or item.get('description'),
                    'duration': parse_duration(program.get('runtimeHours')),
                    'timestamp': parse_iso8601(item.get('releaseDate')),
                }
                yield entry

    def _real_extract(self, url):
        """Return a playlist whose entries are fetched page by page on demand."""
        collection_id = self._match_id(url)
        page_fetcher = functools.partial(self._fetch_page, collection_id)
        entries = OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
        return self.playlist_result(entries, collection_id)
221
222
class NBABaseIE(NBACVPBaseIE):
    """Base class for the nba.com team-site extractors.

    ``_real_extract`` relies on two members that this class does not define
    and that subclasses using it must provide:

    * ``_CONTENT_ID_REGEX`` - name of the webpage key whose quoted value is
      the content id;
    * ``_extract_url_results(team, content_id)`` - builds the final result.
    """

    # Verbose-mode prefix: https://[www.]nba.com/<team>[/play#]/
    _VALID_URL_BASE = r'''(?x)
        https?://(?:www\.)?nba\.com/
            (?P<team>
                blazers|
                bucks|
                bulls|
                cavaliers|
                celtics|
                clippers|
                grizzlies|
                hawks|
                heat|
                hornets|
                jazz|
                kings|
                knicks|
                lakers|
                magic|
                mavericks|
                nets|
                nuggets|
                pacers|
                pelicans|
                pistons|
                raptors|
                rockets|
                sixers|
                spurs|
                suns|
                thunder|
                timberwolves|
                warriors|
                wizards
            )
            (?:/play\#)?/'''
    # Path prefixes that denote a channel/series listing rather than a video.
    _CHANNEL_PATH_REGEX = r'video/channel|series'

    def _embed_url_result(self, team, content_id):
        """Return a URL result pointing at the embed iframe for ``content_id``."""
        return self.url_result(update_url_query(
            'https://secure.nba.com/assets/amp/include/video/iframe.html', {
                'contentId': content_id,
                'team': team,
            }), NBAEmbedIE.ie_key())

    def _call_api(self, team, content_id, query, resource):
        """Query the api.nba.net video endpoint for ``team``.

        ``content_id`` is used as the download's video id, ``resource`` only
        in the progress note. Returns the ``response.result`` value of the
        JSON reply.
        """
        return self._download_json(
            'https://api.nba.net/2/%s/video,imported_video,wsc/' % team,
            content_id, 'Download %s JSON metadata' % resource,
            query=query, headers={
                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
            })['response']['result']

    def _extract_video(self, video, team, extract_all=True):
        """Build an info dict from one API ``video`` entry.

        ``team`` is used only when the entry carries no ``brand`` of its own.
        With ``extract_all`` true, formats are gathered from the ``mp4``,
        ``videoSource``, ``m3u8`` and ``contentXml`` fields; otherwise the
        result is a URL result for the embed page enriched with the
        metadata available here.
        """
        video_id = compat_str(video['nid'])
        # Prefer the brand reported by the API; fall back to the caller's
        # team instead of raising KeyError when the entry has no brand.
        team = video.get('brand') or team

        info = {
            'id': video_id,
            'title': video.get('title') or video.get('headline') or video['shortHeadline'],
            'description': video.get('description'),
            'timestamp': parse_iso8601(video.get('published')),
        }

        subtitles = {}
        captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
        # Every sidecar caption URL is filed under 'en'.
        for caption_url in captions.values():
            subtitles.setdefault('en', []).append({'url': caption_url})

        formats = []
        mp4_url = video.get('mp4')
        if mp4_url:
            formats.append({
                'url': mp4_url,
            })

        if extract_all:
            source_url = video.get('videoSource')
            # s3:// sources are skipped; others are kept only if
            # _is_valid_url accepts them.
            if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'):
                formats.append({
                    'format_id': 'source',
                    'url': source_url,
                    'quality': 1,
                })

            m3u8_url = video.get('m3u8')
            if m3u8_url:
                if '.akamaihd.net/i/' in m3u8_url:
                    formats.extend(self._extract_akamai_formats(
                        m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'}))
                else:
                    formats.extend(self._extract_m3u8_formats(
                        m3u8_url, video_id, 'mp4',
                        'm3u8_native', m3u8_id='hls', fatal=False))

            content_xml = video.get('contentXml')
            if team and content_xml:
                cvp_info = self._extract_nba_cvp_info(
                    team + content_xml, video_id, fatal=False)
                if cvp_info:
                    formats.extend(cvp_info['formats'])
                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
                    info = merge_dicts(info, cvp_info)

            self._sort_formats(formats)
        else:
            info.update(self._embed_url_result(team, video['videoId']))

        info.update({
            'formats': formats,
            'subtitles': subtitles,
        })

        return info

    def _real_extract(self, url):
        """Resolve the content id for ``url`` and delegate to the subclass.

        For ``/play#/`` URLs the id is taken (URL-unquoted) from the URL;
        otherwise the page is downloaded and the id is read from the
        ``<_CONTENT_ID_REGEX>: "<value>"`` pair in it.
        """
        team, display_id = self._match_valid_url(url).groups()
        if '/play#/' in url:
            display_id = compat_urllib_parse_unquote(display_id)
        else:
            webpage = self._download_webpage(url, display_id)
            display_id = self._search_regex(
                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
        return self._extract_url_results(team, display_id)
347
348
class NBAEmbedIE(NBABaseIE):
    """Extractor for the secure.nba.com embed iframe (``?contentId=...``)."""

    # Was misspelled ``IENAME``, so the intended extractor name was never used.
    IE_NAME = 'nba:embed'
    _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
    _TESTS = [{
        'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&ampEnv=',
        'only_matching': True,
    }, {
        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the embedded video.

        Without a non-empty ``team`` query value the request is redirected
        to the watch.nba.com video page; otherwise the team API is queried
        for ``contentId`` and its first result is extracted.

        Raises ExtractorError when the API returns no result.
        """
        qs = parse_qs(url)
        content_id = qs['contentId'][0]
        team = qs.get('team', [None])[0]
        if not team:
            return self.url_result(
                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
        results = self._call_api(team, content_id, {'videoid': content_id}, 'video')
        if not results:
            # An empty result list would otherwise surface as a bare IndexError.
            raise ExtractorError(
                'No video found for content id %s' % content_id, expected=True)
        return self._extract_video(results[0], team)
369
370
class NBAIE(NBABaseIE):
    """Extractor for single videos on nba.com team sites."""

    # Was misspelled ``IENAME``, so the intended extractor name was never used.
    IE_NAME = 'nba'
    # The negative lookahead keeps channel/series listings out of this extractor.
    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
    _TESTS = [{
        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
        'info_dict': {
            'id': '45039',
            'ext': 'mp4',
            'title': 'AND WE BACK.',
            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
            'duration': 94,
            'timestamp': 1607112000,
            'upload_date': '20201218',
        },
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
        'only_matching': True,
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
        'only_matching': True,
    }]
    # Webpage key whose quoted value is used as the content id.
    _CONTENT_ID_REGEX = r'videoID'

    def _extract_url_results(self, team, content_id):
        """Return a URL result for the embed page of ``content_id``."""
        return self._embed_url_result(team, content_id)
396
397
class NBAChannelIE(NBABaseIE):
    """Extractor for channel/series listings on nba.com team sites."""

    # Was misspelled ``IENAME``, so the intended extractor name was never used.
    IE_NAME = 'nba:channel'
    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
    _TESTS = [{
        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
        'info_dict': {
            'title': 'Summer League',
        },
        'playlist_mincount': 138,
    }, {
        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
        'only_matching': True,
    }]
    # Webpage key whose quoted value is used as the channel name.
    _CONTENT_ID_REGEX = r'videoSubCategory'
    _PAGE_SIZE = 100

    def _fetch_page(self, team, channel, page):
        """Yield one entry per video on the given zero-based ``page``.

        Entries are built with ``extract_all=False``, i.e. without
        collecting the full format list for every video.
        """
        results = self._call_api(team, channel, {
            'channels': channel,
            'count': self._PAGE_SIZE,
            'offset': page * self._PAGE_SIZE,
        }, 'page %d' % (page + 1))
        for video in results:
            yield self._extract_video(video, team, False)

    def _extract_url_results(self, team, content_id):
        """Return a playlist titled ``content_id`` whose pages load on demand."""
        entries = OnDemandPagedList(
            functools.partial(self._fetch_page, team, content_id),
            self._PAGE_SIZE)
        return self.playlist_result(entries, playlist_title=content_id)
427 return self.playlist_result(entries, playlist_title=content_id)