]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nba.py
Completely change project name to yt-dlp (#85)
[yt-dlp.git] / yt_dlp / extractor / nba.py
1 from __future__ import unicode_literals
2
3 import functools
4 import re
5
6 from .turner import TurnerBaseIE
7 from ..compat import (
8 compat_parse_qs,
9 compat_str,
10 compat_urllib_parse_unquote,
11 compat_urllib_parse_urlparse,
12 )
13 from ..utils import (
14 int_or_none,
15 merge_dicts,
16 OnDemandPagedList,
17 parse_duration,
18 parse_iso8601,
19 try_get,
20 update_url_query,
21 urljoin,
22 )
23
24
class NBACVPBaseIE(TurnerBaseIE):
    """Common base for the NBA extractors that need CVP info lookups."""

    def _extract_nba_cvp_info(self, path, video_id, fatal=False):
        """Return the CVP info for ``path`` on secure.nba.com.

        This is a thin wrapper around ``self._extract_cvp_info`` that fills
        in the NBA-specific descriptor URL and media source hosts; ``fatal``
        is forwarded unchanged.
        """
        descriptor_url = 'http://secure.nba.com/%s' % path
        # Media source hosts, keyed the way ``_extract_cvp_info`` expects
        # them (presumably per delivery type -- confirm against the base
        # class).
        media_sources = {
            'default': {
                'media_src': 'http://nba.cdn.turner.com/nba/big',
            },
            'm3u8': {
                'media_src': 'http://nbavod-f.akamaihd.net',
            },
        }
        return self._extract_cvp_info(
            descriptor_url, video_id, media_sources, fatal=fatal)
36
37
class NBAWatchBaseIE(NBACVPBaseIE):
    """Base class for the nba.com/watch (watch.nba.com) extractors."""

    # URL prefix shared by all watch extractors; subclasses append their own
    # path pattern to build ``_VALID_URL``.
    _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/'

    def _extract_video(self, filter_key, filter_value):
        """Look up one video in the search index and build its info dict.

        The search is issued as ``<filter_key>:<filter_value>`` and the first
        returned document is used.  Indexing the response raises
        (``KeyError``/``IndexError``) when no document is returned.

        Formats are collected from the publishpoint HLS manifest (plus
        derived plain-HTTP variants) and, when ``seoName`` looks like a dated
        path, from the matching CVP descriptor.
        """
        video = self._download_json(
            'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch',
            filter_value, query={
                'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName',
                'q': filter_key + ':' + filter_value,
                'wt': 'json',
            })['response']['docs'][0]

        # compat_str (not str) so the id is a text string on Python 2 as
        # well, matching how NBABaseIE builds its ids in this file.
        video_id = compat_str(video['pid'])
        title = video['name']

        formats = []
        # The publishpoint request is best effort (fatal=False); a failed
        # download is treated like an empty response.
        m3u8_url = (self._download_json(
            'https://watch.nba.com/service/publishpoint', video_id, query={
                'type': 'video',
                'format': 'json',
                'id': video_id,
            }, headers={
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
            }, fatal=False) or {}).get('path')
        if m3u8_url:
            # Drop the "_pc"/"_iphone" device suffix before the extension.
            m3u8_formats = self._extract_m3u8_formats(
                re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False)
            formats.extend(m3u8_formats)
            # Offer every HLS variant a second time as a plain HTTP format by
            # removing ".m3u8" from its URL.
            for f in m3u8_formats:
                http_f = f.copy()
                http_f.update({
                    'format_id': http_f['format_id'].replace('hls-', 'http-'),
                    'protocol': 'http',
                    'url': http_f['url'].replace('.m3u8', ''),
                })
                formats.append(http_f)

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
            'description': video.get('description'),
            'duration': int_or_none(video.get('runtime')),
            'timestamp': parse_iso8601(video.get('releaseDate')),
            'tags': video.get('tags'),
        }

        seo_name = video.get('seoName')
        # Only seoNames containing a YYYY/MM/DD/ path segment are tried
        # against the CVP descriptor service.
        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
            base_path = ''
            if seo_name.startswith('teams/'):
                # Team videos live under "<team>/video/..."
                base_path += seo_name.split('/')[1] + '/'
            base_path += 'video/'
            cvp_info = self._extract_nba_cvp_info(
                base_path + seo_name + '.xml', video_id, False)
            if cvp_info:
                formats.extend(cvp_info['formats'])
                # info is passed first, so values gathered above are merged
                # ahead of the CVP ones.
                info = merge_dicts(info, cvp_info)

        self._sort_formats(formats)
        info['formats'] = formats
        return info
101
102
class NBAWatchEmbedIE(NBAWatchBaseIE):
    """Extractor for watch embed URLs of the form ``embed?...id=<digits>``."""

    # Was misspelled ``IENAME``; the sibling watch extractors in this file
    # use ``IE_NAME``, so the intended name was never applied.
    IE_NAME = 'nba:watch:embed'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://watch.nba.com/embed?id=659395',
        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
        'info_dict': {
            'id': '659395',
            'ext': 'mp4',
            'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
            'timestamp': 1492228800,
            'upload_date': '20170415',
        },
    }]

    def _real_extract(self, url):
        """Extract the video whose ``pid`` equals the id in the embed URL."""
        video_id = self._match_id(url)
        return self._extract_video('pid', video_id)
122
123
class NBAWatchIE(NBAWatchBaseIE):
    """Extractor for single watch videos addressed by their seoName path."""

    IE_NAME = 'nba:watch'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
        'md5': '9d902940d2a127af3f7f9d2f3dc79c96',
        'info_dict': {
            'id': '70946',
            'ext': 'mp4',
            'title': 'Thunder vs. Nets',
            'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
            'duration': 181,
            'timestamp': 1354597200,
            'upload_date': '20121204',
        },
    }, {
        'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
        'only_matching': True,
    }, {
        'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
        'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
        'info_dict': {
            'id': '330865',
            'ext': 'mp4',
            'title': 'Hawks vs. Cavaliers Game 1',
            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
            'duration': 228,
            'timestamp': 1432094400,
            'upload_date': '20150521',
        },
    }, {
        'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115',
        'only_matching': True,
    }, {
        # only CVP mp4 format available
        'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106',
        'only_matching': True,
    }, {
        'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video, or hand off to its collection playlist.

        When the URL carries a ``collection`` query parameter and
        ``noplaylist`` is not set, a url result pointing at the collection
        page is returned instead of the single video.
        """
        display_id = self._match_id(url)
        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        collection_id = query.get('collection', [None])[0]

        # No collection requested: plain single-video extraction.
        if not collection_id:
            return self._extract_video('seoName', display_id)

        if not self._downloader.params.get('noplaylist'):
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
            collection_url = 'https://www.nba.com/watch/list/collection/' + collection_id
            return self.url_result(
                collection_url, NBAWatchCollectionIE.ie_key(), collection_id)

        self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
        return self._extract_video('seoName', display_id)
178
179
class NBAWatchCollectionIE(NBAWatchBaseIE):
    """Playlist extractor for watch collections (``list/collection/<id>``)."""

    IE_NAME = 'nba:watch:collection'
    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://watch.nba.com/list/collection/season-preview-2020',
        'info_dict': {
            'id': 'season-preview-2020',
        },
        'playlist_mincount': 43,
    }]
    # Number of videos requested per API page.
    _PAGE_SIZE = 100

    def _fetch_page(self, collection_id, page):
        """Yield url-type entries for one page of the collection.

        ``page`` is zero-based as supplied by the paged list; the API is
        queried with ``page + 1``.  Videos lacking both ``seoName`` and
        ``slug`` are skipped.
        """
        api_page = page + 1
        response = self._download_json(
            'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id,
            collection_id, 'Downloading page %d JSON metadata' % api_page,
            query={
                'count': self._PAGE_SIZE,
                'page': api_page,
            })
        for item in response['results']['videos']:
            program = item.get('program') or {}
            seo_name = program.get('seoName') or program.get('slug')
            if seo_name:
                yield {
                    '_type': 'url',
                    'id': program.get('id'),
                    'title': program.get('title') or item.get('title'),
                    'url': 'https://www.nba.com/watch/video/' + seo_name,
                    'thumbnail': item.get('image'),
                    'description': program.get('description') or item.get('description'),
                    'duration': parse_duration(program.get('runtimeHours')),
                    'timestamp': parse_iso8601(item.get('releaseDate')),
                }

    def _real_extract(self, url):
        """Return a lazily paged playlist for the collection in ``url``."""
        collection_id = self._match_id(url)
        page_fetcher = functools.partial(self._fetch_page, collection_id)
        return self.playlist_result(
            OnDemandPagedList(page_fetcher, self._PAGE_SIZE), collection_id)
222
223
class NBABaseIE(NBACVPBaseIE):
    """Base class for the nba.com team-site extractors.

    Subclasses are expected to provide ``_CONTENT_ID_REGEX`` and
    ``_extract_url_results``; both are used by ``_real_extract`` below but
    are not defined on this class.
    """

    # Verbose pattern: whitespace is insignificant, hence the escaped "#".
    _VALID_URL_BASE = r'''(?x)
        https?://(?:www\.)?nba\.com/
            (?P<team>
                blazers|
                bucks|
                bulls|
                cavaliers|
                celtics|
                clippers|
                grizzlies|
                hawks|
                heat|
                hornets|
                jazz|
                kings|
                knicks|
                lakers|
                magic|
                mavericks|
                nets|
                nuggets|
                pacers|
                pelicans|
                pistons|
                raptors|
                rockets|
                sixers|
                spurs|
                suns|
                thunder|
                timberwolves|
                warriors|
                wizards
            )
            (?:/play\#)?/'''
    # Path prefixes that denote a channel/series rather than a single video.
    _CHANNEL_PATH_REGEX = r'video/channel|series'

    def _embed_url_result(self, team, content_id):
        """Return a url result pointing at the embed iframe for the video."""
        iframe_url = update_url_query(
            'https://secure.nba.com/assets/amp/include/video/iframe.html', {
                'contentId': content_id,
                'team': team,
            })
        return self.url_result(iframe_url, NBAEmbedIE.ie_key())

    def _call_api(self, team, content_id, query, resource):
        """Query the team video API and return its ``response.result``.

        ``resource`` is only used in the progress note.
        """
        api_url = 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team
        note = 'Download %s JSON metadata' % resource
        response = self._download_json(
            api_url, content_id, note,
            query=query, headers={
                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
            })
        return response['response']['result']

    def _extract_video(self, video, team, extract_all=True):
        """Build an info dict from one API ``video`` record.

        With ``extract_all`` true, formats are gathered from the source
        file, the HLS manifest and the CVP descriptor.  Otherwise the result
        is turned into a url result for the embed page and only the direct
        mp4 (if any) is listed.

        Note that the ``team`` argument is replaced by ``video['brand']``.
        """
        video_id = compat_str(video['nid'])
        # The record's own brand overrides whatever the caller passed in.
        team = video['brand']

        title = video.get('title') or video.get('headline') or video['shortHeadline']
        info = {
            'id': video_id,
            'title': title,
            'description': video.get('description'),
            'timestamp': parse_iso8601(video.get('published')),
        }

        # All sidecar caption files are filed under English.
        subtitles = {}
        sidecars = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
        for sidecar_url in sidecars.values():
            subtitles.setdefault('en', []).append({'url': sidecar_url})

        formats = []
        direct_mp4 = video.get('mp4')
        if direct_mp4:
            formats.append({'url': direct_mp4})

        if not extract_all:
            info.update(self._embed_url_result(team, video['videoId']))
        else:
            source_url = video.get('videoSource')
            # s3:// sources are skipped; anything else is probed before
            # being offered.
            if (source_url
                    and not source_url.startswith('s3://')
                    and self._is_valid_url(source_url, video_id, 'source')):
                formats.append({
                    'format_id': 'source',
                    'url': source_url,
                    'quality': 1,
                })

            manifest_url = video.get('m3u8')
            if manifest_url:
                if '.akamaihd.net/i/' in manifest_url:
                    hls_formats = self._extract_akamai_formats(
                        manifest_url, video_id, {'http': 'pmd.cdn.turner.com'})
                else:
                    hls_formats = self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4',
                        'm3u8_native', m3u8_id='hls', fatal=False)
                formats.extend(hls_formats)

            content_xml = video.get('contentXml')
            if team and content_xml:
                cvp_info = self._extract_nba_cvp_info(
                    team + content_xml, video_id, fatal=False)
                if cvp_info:
                    formats.extend(cvp_info['formats'])
                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
                    # info comes first, so API values are merged ahead of
                    # the CVP ones.
                    info = merge_dicts(info, cvp_info)

            self._sort_formats(formats)

        info['formats'] = formats
        info['subtitles'] = subtitles
        return info

    def _real_extract(self, url):
        """Resolve the content id for ``url`` and delegate to the subclass.

        For ``/play#/`` URLs the id is the percent-decoded path from the URL;
        otherwise it is scraped from the page via ``_CONTENT_ID_REGEX``.
        """
        team, display_id = re.match(self._VALID_URL, url).groups()
        if '/play#/' not in url:
            webpage = self._download_webpage(url, display_id)
            content_id = self._search_regex(
                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
        else:
            content_id = compat_urllib_parse_unquote(display_id)
        return self._extract_url_results(team, content_id)
348
349
class NBAEmbedIE(NBABaseIE):
    """Extractor for the secure.nba.com embed iframe pages."""

    # Was misspelled ``IENAME``; other extractors in this file use
    # ``IE_NAME``, so the intended name was never applied.
    IE_NAME = 'nba:embed'
    _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
    _TESTS = [{
        'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&ampEnv=',
        'only_matching': True,
    }, {
        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the embedded video identified by the URL's query string.

        Without a (non-empty) ``team`` parameter the content id is handed to
        NBAWatchIE; otherwise the team API is queried by ``videoid`` and the
        first result is extracted.
        """
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        content_id = qs['contentId'][0]
        team = qs.get('team', [None])[0]
        if not team:
            return self.url_result(
                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
        video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0]
        return self._extract_video(video, team)
370
371
class NBAIE(NBABaseIE):
    """Extractor for single videos on the nba.com team sites."""

    # Was misspelled ``IENAME``; other extractors in this file use
    # ``IE_NAME``, so the intended name was never applied.
    IE_NAME = 'nba'
    # The negative lookahead keeps channel/series paths out of this
    # extractor; "%" binds tighter than "+", so only the suffix is formatted.
    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
    _TESTS = [{
        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
        'info_dict': {
            'id': '45039',
            'ext': 'mp4',
            'title': 'AND WE BACK.',
            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
            'duration': 94,
            'timestamp': 1607112000,
            'upload_date': '20201218',
        },
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
        'only_matching': True,
    }, {
        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
        'only_matching': True,
    }]
    # Page key whose quoted value holds the content id (used by the base
    # class when the id has to be scraped from the web page).
    _CONTENT_ID_REGEX = r'videoID'

    def _extract_url_results(self, team, content_id):
        """Return a url result for the embed page of ``content_id``."""
        return self._embed_url_result(team, content_id)
397
398
class NBAChannelIE(NBABaseIE):
    """Playlist extractor for team-site video channels and series."""

    # Was misspelled ``IENAME``; other extractors in this file use
    # ``IE_NAME``, so the intended name was never applied.
    IE_NAME = 'nba:channel'
    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
    _TESTS = [{
        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
        'info_dict': {
            'title': 'Summer League',
        },
        'playlist_mincount': 138,
    }, {
        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
        'only_matching': True,
    }]
    # Page key whose quoted value holds the channel name (used by the base
    # class when the id has to be scraped from the web page).
    _CONTENT_ID_REGEX = r'videoSubCategory'
    # Number of videos requested per API page.
    _PAGE_SIZE = 100

    def _fetch_page(self, team, channel, page):
        """Yield entries for one zero-based ``page`` of ``channel``.

        Entries are built with ``extract_all=False``, i.e. as url results
        for the embed page rather than fully extracted videos.
        """
        results = self._call_api(team, channel, {
            'channels': channel,
            'count': self._PAGE_SIZE,
            'offset': page * self._PAGE_SIZE,
        }, 'page %d' % (page + 1))
        for video in results:
            yield self._extract_video(video, team, False)

    def _extract_url_results(self, team, content_id):
        """Return a lazily paged playlist titled after the channel id."""
        entries = OnDemandPagedList(
            functools.partial(self._fetch_page, team, content_id),
            self._PAGE_SIZE)
        return self.playlist_result(entries, playlist_title=content_id)