6 from .common
import InfoExtractor
24 class CBCIE(InfoExtractor
):
26 _VALID_URL
= r
'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
29 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
30 'md5': '97e24d09672fc4cf56256d6faa6c25bc',
34 'title': 'Don Cherry – All-Stars',
35 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
36 'timestamp': 1454463000,
37 'upload_date': '20160203',
38 'uploader': 'CBCC-NEW',
40 'skip': 'Geo-restricted to Canada',
42 # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
43 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
44 'md5': '162adfa070274b144f4fdc3c3b8207db',
48 'title': '22 Minutes Update: What Not To Wear Quebec',
49 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
50 'upload_date': '20131025',
51 'uploader': 'CBCC-NEW',
52 'timestamp': 1382717907,
54 'skip': 'No longer available',
56 # with clipId, feed only available via tpfeed.cbc.ca
57 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
58 'md5': '0274a90b51a9b4971fe005c63f592f12',
62 'title': 'Robin Williams freestyles on 90 Minutes Live',
63 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
64 'upload_date': '19780210',
65 'uploader': 'CBCC-NEW',
66 'timestamp': 255977160,
70 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
72 'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
76 'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
77 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
78 'upload_date': '20160201',
79 'timestamp': 1454342820,
80 'uploader': 'CBCC-NEW',
83 'md5': '415a0e3f586113894174dfb31aa5bb1a',
87 'title': 'Fly like an eagle!',
88 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
89 'upload_date': '20150315',
90 'timestamp': 1426443984,
91 'uploader': 'CBCC-NEW',
94 'skip': 'Geo-restricted to Canada',
96 # multiple CBC.APP.Caffeine.initInstance(...)
97 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238',
99 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
100 'id': 'dog-indoor-exercise-winter-1.3928238',
101 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
103 'playlist_mincount': 6,
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs that ``CBCPlayerIE`` accepts are always rejected here; every other
    URL is judged by the parent class's ``suitable``.
    """
    # NOTE(review): the method takes ``cls``, so it is presumably decorated
    # with @classmethod on the line above -- that line is not part of this
    # block; confirm it is present.
    if CBCPlayerIE.suitable(url):
        return False
    return super(CBCIE, cls).suitable(url)
def _extract_player_init(self, player_init, display_id):
    """Turn one player-init JS object into a ``CBCPlayer`` url_result.

    Args:
        player_init: JavaScript object literal (text) passed to the page's
            player initialiser.
        display_id: identifier used when parsing/reporting.

    Returns:
        The value of ``self.url_result('cbcplayer:<media_id>', 'CBCPlayer',
        media_id)``.

    The media id is taken from ``mediaId`` when present. Otherwise it is
    looked up from ``clipId``: first through the tpfeed.cbc.ca feed
    (non-fatal), then through the feed.theplatform.com feed.
    """
    player_info = self._parse_json(player_init, display_id, js_to_json)
    media_id = player_info.get('mediaId')
    # NOTE(review): the guard lines of this fallback chain are absent from
    # the text under review. They are reconstructed from the three successive
    # ``media_id`` assignments and from ``fatal=False`` on the first feed
    # request -- confirm against the upstream file.
    if not media_id:
        clip_id = player_info['clipId']
        feed = self._download_json(
            'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id,
            clip_id, fatal=False)
        if feed:
            media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str)
        if not media_id:
            # Last resort: the id is the final path component of the entry id.
            media_id = self._download_json(
                'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
                clip_id)['entries'][0]['id'].split('/')[-1]
    return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
def _real_extract(self, url):
    """Collect every CBC player reference on a cbc.ca page into a playlist.

    Args:
        url: page URL matched by this extractor.

    Returns:
        The value of ``self.playlist_result`` built from the entries found,
        using the display id, the stripped page title and the page's
        og:description.
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)

    # Title sources, in order of preference: og:title, twitter:title, <title>.
    title = (self._og_search_title(webpage, default=None)
             or self._html_search_meta('twitter:title', webpage, 'title', default=None)
             or self._html_extract_title(webpage))

    # NOTE(review): the lines that open the two lists and the regex loop are
    # absent from the text under review; they are reconstructed from the
    # remaining tokens (``...)]``, ``media_ids.extend(...)``,
    # ``for media_id in orderedSet(media_ids)])`` and the use of ``entries``
    # below) -- confirm against the upstream file.

    # One entry per CBC.APP.Caffeine.initInstance({...}); call on the page.
    entries = [
        self._extract_player_init(player_init, display_id)
        for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]

    # Numeric media ids embedded directly in the markup.
    media_ids = []
    for media_id_re in (
            r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
            r'<div[^>]+\bid=["\']player-(\d+)',
            r'guid["\']\s*:\s*["\'](\d+)'):
        media_ids.extend(re.findall(media_id_re, webpage))

    # orderedSet drops repeated ids while keeping first-seen order.
    entries.extend([
        self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
        for media_id in orderedSet(media_ids)])

    return self.playlist_result(
        entries, display_id, strip_or_none(title),
        self._og_search_description(webpage))
149 class CBCPlayerIE(InfoExtractor):
150 IE_NAME = 'cbc
.ca
:player
'
151 _VALID_URL = r'(?
:cbcplayer
:|https?
://(?
:www\
.)?cbc\
.ca
/(?
:player
/play
/|i
/caffeine
/syndicate
/\?mediaId
=))(?P
<id>\d
+)'
153 'url
': 'http
://www
.cbc
.ca
/player
/play
/2683190193',
154 'md5
': '64d25f841ddf4ddb28a235338af32e2c
',
158 'title
': 'Gerry Runs a Sweat Shop
',
159 'description
': 'md5
:b457e1c01e8ff408d9d801c1c2cd29b0
',
160 'timestamp
': 1455071400,
161 'upload_date
': '20160210',
162 'uploader
': 'CBCC
-NEW
',
164 'skip
': 'Geo
-restricted to Canada
',
166 # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
167 'url
': 'http
://www
.cbc
.ca
/player
/play
/2657631896',
168 'md5
': 'e5e708c34ae6fca156aafe17c43e8b75
',
172 'title
': 'CBC Montreal
is organizing its first ever community hackathon
!',
173 'description
': 'The modern technology we tend to depend on so heavily
, is never without it
\'s share of hiccups
and headaches
. Next weekend
- CBC Montreal will be getting members of the public
for its first Hackathon
.',
174 'timestamp
': 1425704400,
175 'upload_date
': '20150307',
176 'uploader
': 'CBCC
-NEW
',
179 'url
': 'http
://www
.cbc
.ca
/player
/play
/2164402062',
180 'md5
': '33fcd8f6719b9dd60a5e73adcb83b9f6
',
184 'title
': 'Cancer survivor four times over
',
185 'description
': 'Tim Mayer has beaten three different forms of cancer four times
in five years
.',
186 'timestamp
': 1320410746,
187 'upload_date
': '20111104',
188 'uploader
': 'CBCC
-NEW
',
192 def _real_extract(self, url):
193 video_id = self._match_id(url)
195 '_type
': 'url_transparent
',
196 'ie_key
': 'ThePlatform
',
198 'http
://link
.theplatform
.com
/s
/ExhSPC
/media
/guid
/2655402169/%s?mbr
=true
&formats
=MPEG4
,FLV
,MP3
' % video_id, {
199 'force_smil_url
': True
205 class CBCGemIE(InfoExtractor):
206 IE_NAME = 'gem
.cbc
.ca
'
207 _VALID_URL = r'https?
://gem\
.cbc\
.ca
/(?
:media
/)?
(?P
<id>[0-9a
-z
-]+/s
[0-9]+[a
-z
][0-9]+)'
209 # This is a normal, public, TV show video
210 'url
': 'https
://gem
.cbc
.ca
/media
/schitts
-creek
/s06e01
',
211 'md5
': '93dbb31c74a8e45b378cf13bd3f6f11e
',
213 'id': 'schitts
-creek
/s06e01
',
215 'title
': 'Smoke Signals
',
216 'description
': 'md5
:929868d20021c924020641769eb3e7f1
',
217 'thumbnail
': 'https
://images
.radio
-canada
.ca
/v1
/synps
-cbc
/episode
/perso
/cbc_schitts_creek_season_06e01_thumbnail_v01
.jpg?im
=Resize
=(Size
)',
219 'categories
': ['comedy
'],
220 'series
': 'Schitt
\'s Creek
',
221 'season
': 'Season
6',
223 'episode
': 'Smoke Signals
',
225 'episode_id
': 'schitts
-creek
/s06e01
',
227 'params
': {'format': 'bv'},
228 'skip
': 'Geo
-restricted to Canada
',
230 # This video requires an account in the browser, but works fine in yt-dlp
231 'url
': 'https
://gem
.cbc
.ca
/media
/schitts
-creek
/s01e01
',
232 'md5
': '297a9600f554f2258aed01514226a697
',
234 'id': 'schitts
-creek
/s01e01
',
236 'title
': 'The Cup Runneth Over
',
237 'description
': 'md5
:9bca14ea49ab808097530eb05a29e797
',
238 'thumbnail
': 'https
://images
.radio
-canada
.ca
/v1
/synps
-cbc
/episode
/perso
/cbc_schitts_creek_season_01e01_thumbnail_v01
.jpg?im
=Resize
=(Size
)',
239 'series
': 'Schitt
\'s Creek
',
241 'season
': 'Season
1',
243 'episode
': 'The Cup Runneth Over
',
244 'episode_id
': 'schitts
-creek
/s01e01
',
246 'categories
': ['comedy
'],
248 'params
': {'format': 'bv'},
249 'skip
': 'Geo
-restricted to Canada
',
251 'url
': 'https
://gem
.cbc
.ca
/nadiyas
-family
-favourites
/s01e01
',
252 'only_matching
': True,
255 _GEO_COUNTRIES = ['CA
']
256 _TOKEN_API_KEY = '3f4beddd
-2061-49b0
-ae80
-6f1f2ed65b37
'
257 _NETRC_MACHINE = 'cbcgem
'
260 def _new_claims_token(self, email, password):
263 'password
': password,
265 headers = {'content-type': 'application/json'}
266 query = {'apikey': self._TOKEN_API_KEY}
267 resp = self._download_json('https
://api
.loginradius
.com
/identity
/v2
/auth
/login
',
268 None, data=data, headers=headers, query=query)
269 access_token = resp['access_token
']
272 'access_token
': access_token,
273 'apikey
': self._TOKEN_API_KEY,
276 resp = self._download_json('https
://cloud
-api
.loginradius
.com
/sso
/jwt
/api
/token
',
277 None, headers=headers, query=query)
278 sig = resp['signature
']
280 data = json.dumps({'jwt': sig}).encode()
281 headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
282 resp = self._download_json('https
://services
.radio
-canada
.ca
/ott
/cbc
-api
/v2
/token
',
283 None, data=data, headers=headers)
284 cbc_access_token = resp['accessToken
']
286 headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
287 resp = self._download_json('https
://services
.radio
-canada
.ca
/ott
/cbc
-api
/v2
/profile
',
288 None, headers=headers)
289 return resp['claimsToken
']
291 def _get_claims_token_expiry(self):
293 # JWT is decoded here and 'exp
' field is extracted
294 # It is a Unix timestamp for when the token expires
295 b64_data = self._claims_token.split('.')[1]
296 data = base64.urlsafe_b64decode(b64_data + "==")
297 return json.loads(data)['exp
']
def claims_token_expired(self):
    """Return True when the claims token has expired or is about to.

    "About to" means fewer than 10 seconds remain before the expiry
    timestamp reported by ``_get_claims_token_expiry``.
    """
    exp = self._get_claims_token_expiry()
    # NOTE(review): the statements following the ``if`` are absent from the
    # text under review; returning the comparison result directly is the
    # reconstruction implied by the original comment ("It will expire in
    # less than 10 seconds, or has already expired") -- confirm upstream.
    return exp - time.time() < 10
def claims_token_valid(self):
    """Return True when a claims token is stored and it has not expired."""
    if self._claims_token is None:
        # No token at all: the expiry check is not attempted.
        return False
    return not self.claims_token_expired()
309 def _get_claims_token(self, email, password):
310 if not self.claims_token_valid():
311 self._claims_token = self._new_claims_token(email, password)
312 self.cache.store(self._NETRC_MACHINE, 'claims_token
', self._claims_token)
313 return self._claims_token
315 def _real_initialize(self):
316 if self.claims_token_valid():
318 self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token
')
320 def _find_secret_formats(self, formats, video_id):
321 """ Find a valid video url and convert it to the secret variant """
322 base_format = next((f for f in formats if f.get('vcodec
') != 'none
'), None)
326 base_url = re.sub(r'(Manifest\
(.*?
),filter=[\w
-]+(.*?\
))', r'\
1\
2', base_format['url
'])
327 url = re.sub(r'(Manifest\
(.*?
),format
=[\w
-]+(.*?\
))', r'\
1\
2', base_url)
329 secret_xml = self._download_xml(url, video_id, note='Downloading secret XML
', fatal=False)
333 for child in secret_xml:
334 if child.attrib.get('Type
') != 'video
':
336 for video_quality in child:
337 bitrate = int_or_none(video_quality.attrib.get('Bitrate
'))
338 if not bitrate or 'Index
' not in video_quality.attrib:
340 height = int_or_none(video_quality.attrib.get('MaxHeight
'))
344 'format_id
': join_nonempty('sec
', height),
345 # Note: \g<1> is necessary instead of \1 since bitrate is a number
346 'url
': re.sub(r'(QualityLevels\
()\d
+(\
))', fr'\g
<1>{bitrate}\
2', base_url),
347 'width
': int_or_none(video_quality.attrib.get('MaxWidth
')),
348 'tbr
': bitrate / 1000.0,
352 def _real_extract(self, url):
353 video_id = self._match_id(url)
354 video_info = self._download_json(
355 f'https
://services
.radio
-canada
.ca
/ott
/cbc
-api
/v2
/assets
/{video_id}
',
356 video_id, expected_status=426)
358 email, password = self._get_login_info()
359 if email and password:
360 claims_token = self._get_claims_token(email, password)
361 headers = {'x-claims-token': claims_token}
364 m3u8_info = self._download_json(video_info['playSession
']['url
'], video_id, headers=headers)
365 m3u8_url = m3u8_info.get('url
')
367 if m3u8_info.get('errorCode
') == 1:
368 self.raise_geo_restricted(countries=['CA
'])
369 elif m3u8_info.get('errorCode
') == 35:
370 self.raise_login_required(method='password
')
371 elif m3u8_info.get('errorCode
') != 0:
372 raise ExtractorError(f'{self.IE_NAME} said
: {m3u8_info.get("errorCode")}
- {m3u8_info.get("message")}
')
374 formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls
')
375 self._remove_duplicate_formats(formats)
376 formats.extend(self._find_secret_formats(formats, video_id))
378 for format in formats:
379 if format.get('vcodec
') == 'none
':
380 if format.get('ext
') is None:
381 format['ext
'] = 'm4a
'
382 if format.get('acodec
') is None:
383 format['acodec
'] = 'mp4a
.40.2'
385 # Put described audio at the beginning of the list, so that it
386 # isn't chosen by default
, as most people won
't want it.
387 if 'descriptive
' in format['format_id
'].lower():
388 format['preference
'] = -2
392 'title
': video_info['title
'],
393 'description
': video_info.get('description
'),
394 'thumbnail
': video_info.get('image
'),
395 'series
': video_info.get('series
'),
396 'season_number
': video_info.get('season
'),
397 'season
': f'Season {video_info.get("season")}
',
398 'episode_number
': video_info.get('episode
'),
399 'episode
': video_info.get('title
'),
400 'episode_id
': video_id,
401 'duration
': video_info.get('duration
'),
402 'categories
': [video_info.get('category
')],
404 'release_timestamp
': video_info.get('airDate
'),
405 'timestamp
': video_info.get('availableDate
'),
409 class CBCGemPlaylistIE(InfoExtractor):
410 IE_NAME = 'gem
.cbc
.ca
:playlist
'
411 _VALID_URL = r'https?
://gem\
.cbc\
.ca
/(?
:media
/)?
(?P
<id>(?P
<show
>[0-9a
-z
-]+)/s(?P
<season
>[0-9]+))/?
(?
:[?
#]|$)'
413 # TV show playlist, all public videos
414 'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
415 'playlist_count': 16,
417 'id': 'schitts-creek/s06',
419 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
422 'url': 'https://gem.cbc.ca/schitts-creek/s06',
423 'only_matching': True,
425 _API_BASE
= 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
427 def _real_extract(self
, url
):
428 match
= self
._match
_valid
_url
(url
)
429 season_id
= match
.group('id')
430 show
= match
.group('show')
431 show_info
= self
._download
_json
(self
._API
_BASE
+ show
, season_id
, expected_status
=426)
432 season
= int(match
.group('season'))
434 season_info
= next((s
for s
in show_info
['seasons'] if s
.get('season') == season
), None)
436 if season_info
is None:
437 raise ExtractorError(f
'Couldn\'t find season {season} of {show}')
440 for episode
in season_info
['assets']:
442 '_type': 'url_transparent',
444 'url': 'https://gem.cbc.ca/media/' + episode
['id'],
446 'title': episode
.get('title'),
447 'description': episode
.get('description'),
448 'thumbnail': episode
.get('image'),
449 'series': episode
.get('series'),
450 'season_number': episode
.get('season'),
451 'season': season_info
['title'],
452 'season_id': season_info
.get('id'),
453 'episode_number': episode
.get('episode'),
454 'episode': episode
.get('title'),
455 'episode_id': episode
['id'],
456 'duration': episode
.get('duration'),
457 'categories': [episode
.get('category')],
461 tn_uri
= season_info
.get('image')
462 # the-national was observed to use a "data:image/png;base64"
463 # URI for their 'image' value. The image was 1x1, and is
464 # probably just a placeholder, so it is ignored.
465 if tn_uri
is not None and not tn_uri
.startswith('data:'):
472 'title': season_info
['title'],
473 'description': season_info
.get('description'),
474 'thumbnail': thumbnail
,
475 'series': show_info
.get('title'),
476 'season_number': season_info
.get('season'),
477 'season': season_info
['title'],
481 class CBCGemLiveIE(InfoExtractor
):
482 IE_NAME
= 'gem.cbc.ca:live'
483 _VALID_URL
= r
'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)'
486 'url': 'https://gem.cbc.ca/live/920604739687',
489 'description': 'The live TV channel and local programming from Ottawa',
490 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
492 'id': 'AyqZwxRqh8EH',
494 'timestamp': 1492106160,
495 'upload_date': '20170413',
496 'uploader': 'CBCC-NEW',
498 'skip': 'Live might have ended',
501 'url': 'https://gem.cbc.ca/live/44',
506 'title': r
're:^Ottawa [0-9\-: ]+',
507 'description': 'The live TV channel and local programming from Ottawa',
508 'live_status': 'is_live',
509 'thumbnail': r
're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*'
511 'params': {'skip_download': True}
,
512 'skip': 'Live might have ended',
515 'url': 'https://gem.cbc.ca/live-event/10835',
520 'title': r
're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+',
521 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.',
522 'live_status': 'is_live',
523 'thumbnail': r
're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*',
524 'timestamp': 1679706000,
525 'upload_date': '20230325',
527 'params': {'skip_download': True}
,
528 'skip': 'Live might have ended',
532 def _real_extract(self
, url
):
533 video_id
= self
._match
_id
(url
)
534 webpage
= self
._download
_webpage
(url
, video_id
)
535 video_info
= self
._search
_nextjs
_data
(webpage
, video_id
)['props']['pageProps']['data']
537 # Two types of metadata JSON
538 if not video_info
.get('formattedIdMedia'):
539 video_info
= traverse_obj(
540 video_info
, (('freeTv', ('streams', ...)), 'items', lambda _
, v
: v
['key'] == video_id
, {dict}
),
541 get_all
=False, default
={})
543 video_stream_id
= video_info
.get('formattedIdMedia')
544 if not video_stream_id
:
545 raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected
=True)
547 stream_data
= self
._download
_json
(
548 'https://services.radio-canada.ca/media/validation/v2/', video_id
, query
={
550 'connectionType': 'hd',
551 'deviceType': 'ipad',
552 'idMedia': video_stream_id
,
553 'multibitrate': 'true',
556 'manifestType': 'desktop',
561 'formats': self
._extract
_m
3u8_formats
(stream_data
['url'], video_id
, 'mp4', live
=True),
563 **traverse_obj(video_info
, {
565 'description': 'description',
566 'thumbnail': ('images', 'card', 'url'),
567 'timestamp': ('airDate', {parse_iso8601}
),