# Source: jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/drtv.py
# Latest commit shown by the viewer: [extractor/adobepass] Handle `Charter_Direct` MSO as `Spectrum` (#6824)
# Path: [yt-dlp.git] / yt_dlp / extractor / drtv.py
1 import binascii
2 import hashlib
3 import re
4
5 from .common import InfoExtractor
6 from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
7 from ..compat import compat_urllib_parse_unquote
8 from ..utils import (
9 ExtractorError,
10 float_or_none,
11 int_or_none,
12 mimetype2ext,
13 str_or_none,
14 traverse_obj,
15 unified_timestamp,
16 update_url_query,
17 url_or_none,
18 )
19
# URL template for the dr-massive page API; the single ``%s`` placeholder is
# filled with a page path such as ``/episode/<slug>``, ``/saeson/<slug>_<id>``
# or ``/serie/<slug>_<id>`` by the extractors in this module.
SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s'
21
22
class DRTVIE(InfoExtractor):
    """Extractor for single DR programmes (dr.dk TV/radio/news pages and DRTV episode pages)."""

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
                        )
                        (?P<id>[\da-z_-]+)
                    '''
    _GEO_BYPASS = False
    _GEO_COUNTRIES = ['DK']
    IE_NAME = 'drtv'
    _TESTS = [{
        'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
        'md5': '25e659cccc9a2ed956110a299fdf5983',
        'info_dict': {
            'id': 'klassen-darlig-taber-10',
            'ext': 'mp4',
            'title': 'Klassen - Dårlig taber (10)',
            'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
            'timestamp': 1539085800,
            'upload_date': '20181009',
            'duration': 606.84,
            'series': 'Klassen',
            'season': 'Klassen I',
            'season_number': 1,
            'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b',
            'episode': 'Episode 10',
            'episode_number': 10,
            'release_year': 2016,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
        'skip': 'this video has been removed',
    }, {
        # embed
        'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
        'info_dict': {
            'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
            'ext': 'mp4',
            'title': 'christiania pusher street ryddes drdkrjpo',
            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
            'timestamp': 1472800279,
            'upload_date': '20160902',
            'duration': 131.4,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        # with SignLanguage formats
        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
        'info_dict': {
            'id': '00831690010',
            'ext': 'mp4',
            'title': 'Historien om Danmark: Stenalder',
            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
            'timestamp': 1546628400,
            'upload_date': '20190104',
            'duration': 3504.619,
            'formats': 'mincount:20',
            'release_year': 2017,
            'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35',
            'season_number': 1,
            'season': 'Historien om Danmark',
            'series': 'Historien om Danmark',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
        'info_dict': {
            'id': '00951930010',
            'ext': 'mp4',
            'title': 'Bonderøven 2019 (1:8)',
            'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
            'timestamp': 1654856100,
            'upload_date': '20220610',
            'duration': 2576.6,
            'season': 'Bonderøven 2019',
            'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
            'release_year': 2019,
            'season_number': 2019,
            'series': 'Frank & Kastaniegaarden',
            'episode_number': 1,
            'episode': 'Episode 1',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769',
        'only_matching': True,
    }, {
        'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/drtv/program/jagten_220924',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3',
        'info_dict': {
            'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113',
            'title': "Regionale nyheder",
            'ext': 'mp4',
            'duration': 120.043,
            'series': 'P4 Østjylland regionale nyheder',
            'timestamp': 1651746600,
            'season': 'Regionale nyheder',
            'release_year': 0,
            'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5',
            'description': '',
            'upload_date': '20220505',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'this video has been removed',
    }, {
        'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9',
        'info_dict': {
            'ext': 'mp4',
            'id': '14802310112',
            'timestamp': 1678786200,
            'duration': 120.043,
            'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f',
            'series': 'P4 København regionale nyheder',
            'upload_date': '20230314',
            'release_year': 0,
            'description': 'Hør seneste regionale nyheder fra P4 København.',
            'season': 'Regionale nyheder',
            'title': 'Regionale nyheder',
        },
    }]

    def _real_extract(self, url):
        """Resolve a programme page to its programcard JSON and build the info dict.

        The video id is taken from page markup when possible; otherwise a
        production number is read from the embedded page data and passed to
        the programcard API as a query parameter.

        Raises ExtractorError when the page reports the programme as no longer
        available or when no id can be determined, and raises the
        geo-restriction error when no formats were found and the asset was
        flagged as restricted to Denmark.
        """
        raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio')

        webpage = self._download_webpage(url, raw_video_id)

        if '>Programmet er ikke længere tilgængeligt' in webpage:
            raise ExtractorError(
                'Video %s is not available' % raw_video_id, expected=True)

        video_id = self._search_regex(
            (r'data-(?:material-identifier|episode-slug)="([^"]+)"',
             r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
            webpage, 'video id', default=None)

        if not video_id:
            # Fall back to a (possibly percent-encoded) programcard URN anywhere in the page
            video_id = self._search_regex(
                r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
                webpage, 'urn', default=None)
            if video_id:
                video_id = compat_urllib_parse_unquote(video_id)

        _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard'
        query = {'expanded': 'true'}

        if video_id:
            programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
        else:
            programcard_url = _PROGRAMCARD_BASE
            if is_radio_url:
                # traverse_obj instead of chained subscripts so that a missing
                # key ends in the ExtractorError below, not a bare KeyError
                video_id = traverse_obj(
                    self._search_nextjs_data(webpage, raw_video_id),
                    ('props', 'pageProps', 'episode', 'productionNumber'))
            else:
                json_data = self._search_json(
                    r'window\.__data\s*=', webpage, 'data', raw_video_id)
                video_id = traverse_obj(json_data, (
                    'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId',
                    {lambda x: x.split(':')[-1]}), get_all=False)
            if not video_id:
                raise ExtractorError('Unable to extract video id')
            query['productionnumber'] = video_id

        data = self._download_json(
            programcard_url, video_id, 'Downloading video JSON', query=query)

        # Slugs ending in "_<digits>" additionally get a (non-fatal) lookup in the page API
        supplementary_data = {}
        if re.search(r'_\d+$', raw_video_id):
            supplementary_data = self._download_json(
                SERIES_API % f'/episode/{raw_video_id}', raw_video_id, fatal=False) or {}

        title = str_or_none(data.get('Title')) or re.sub(
            r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
            self._og_search_title(webpage))
        description = self._og_search_description(
            webpage, default=None) or data.get('Description')

        timestamp = unified_timestamp(
            data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime'))

        thumbnail = None
        duration = None

        restricted_to_denmark = False

        formats = []
        subtitles = {}

        # Collect the primary asset followed by every dict-shaped secondary asset
        assets = []
        primary_asset = data.get('PrimaryAsset')
        if isinstance(primary_asset, dict):
            assets.append(primary_asset)
        secondary_assets = data.get('SecondaryAssets')
        if isinstance(secondary_assets, list):
            assets.extend(
                secondary_asset for secondary_asset in secondary_assets
                if isinstance(secondary_asset, dict))

        def hex_to_bytes(hex_string):
            return binascii.a2b_hex(hex_string.encode('ascii'))

        def decrypt_uri(encrypted):
            # Layout as read here: chars 2-10 hold the hex length n of the
            # ciphertext, the next n chars are the hex ciphertext and the
            # remainder is the hex IV, which also salts the SHA-256 key
            length = int(encrypted[2:10], 16)
            iv_hex = encrypted[10 + length:]
            ciphertext = hex_to_bytes(encrypted[10:10 + length])
            key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % iv_hex).encode('utf-8')).digest()
            iv = hex_to_bytes(iv_hex)
            decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, iv))
            # Only the part before the query string is kept
            return decrypted.decode('utf-8').split('?')[0]

        # Maps API language names to subtitle language codes; unknown names are used as-is
        LANGS = {
            'Danish': 'da',
        }

        for asset in assets:
            kind = asset.get('Kind')
            if kind == 'Image':
                thumbnail = url_or_none(asset.get('Uri'))
            elif kind in ('VideoResource', 'AudioResource'):
                # duration/restriction are overwritten by each media asset, so the last one wins
                duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
                restricted_to_denmark = asset.get('RestrictedToDenmark')
                asset_target = asset.get('Target')
                # `or []` also covers an explicit null value for "Links"
                for link in asset.get('Links') or []:
                    if not isinstance(link, dict):
                        continue
                    uri = link.get('Uri')
                    if not uri:
                        encrypted_uri = link.get('EncryptedUri')
                        if not encrypted_uri:
                            continue
                        try:
                            uri = decrypt_uri(encrypted_uri)
                        except Exception:
                            # Best effort: a link that cannot be decrypted is skipped with a warning
                            self.report_warning(
                                'Unable to decrypt EncryptedUri', video_id)
                            continue
                    uri = url_or_none(uri)
                    if not uri:
                        continue
                    target = link.get('Target')
                    format_id = target or ''
                    if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
                        preference = -1
                        format_id += '-%s' % asset_target
                    elif asset_target == 'Default':
                        preference = 1
                    else:
                        preference = None
                    if target == 'HDS':
                        f4m_formats = self._extract_f4m_formats(
                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
                            video_id, preference, f4m_id=format_id, fatal=False)
                        if kind == 'AudioResource':
                            for f in f4m_formats:
                                f['vcodec'] = 'none'
                        formats.extend(f4m_formats)
                    elif target == 'HLS':
                        fmts, subs = self._extract_m3u8_formats_and_subtitles(
                            uri, video_id, 'mp4', entry_protocol='m3u8_native',
                            quality=preference, m3u8_id=format_id, fatal=False)
                        formats.extend(fmts)
                        self._merge_subtitles(subs, target=subtitles)
                    else:
                        # Any other target is treated as a direct link
                        bitrate = link.get('Bitrate')
                        if bitrate:
                            format_id += '-%s' % bitrate
                        formats.append({
                            'url': uri,
                            'format_id': format_id,
                            'tbr': int_or_none(bitrate),
                            'ext': link.get('FileFormat'),
                            'vcodec': 'none' if kind == 'AudioResource' else None,
                            'quality': preference,
                        })
                # Both spellings of the key are checked
                subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
                if isinstance(subtitles_list, list):
                    for subs in subtitles_list:
                        if not isinstance(subs, dict):
                            continue
                        sub_uri = url_or_none(subs.get('Uri'))
                        if not sub_uri:
                            continue
                        lang = subs.get('Language') or 'da'
                        subtitles.setdefault(LANGS.get(lang, lang), []).append({
                            'url': sub_uri,
                            'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
                        })

        if not formats and restricted_to_denmark:
            self.raise_geo_restricted(
                'Unfortunately, DR is not allowed to show this program outside Denmark.',
                countries=self._GEO_COUNTRIES)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
            'series': str_or_none(data.get('SeriesTitle')),
            'season': str_or_none(data.get('SeasonTitle')),
            'season_number': int_or_none(data.get('SeasonNumber')),
            'season_id': str_or_none(data.get('SeasonUrn')),
            # Page-API values take precedence over the programcard fields when present
            'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')),
            'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')),
            'release_year': int_or_none(data.get('ProductionYear')),
        }
347
348
class DRTVLiveIE(InfoExtractor):
    """Extractor for DR live TV channel pages (dr.dk/tv/live/<channel>)."""

    IE_NAME = 'drtv:live'
    _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
    _GEO_COUNTRIES = ['DK']
    _TEST = {
        'url': 'https://www.dr.dk/tv/live/dr1',
        'info_dict': {
            'id': 'dr1',
            'ext': 'mp4',
            'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Fetch the channel JSON and collect HLS/HDS formats from every listed stream.

        Returns a live info dict; a missing ``Title`` in the channel JSON
        raises KeyError, as the key is read with a plain subscript.
        """
        channel_id = self._match_id(url)
        channel_data = self._download_json(
            'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id,
            channel_id)
        title = channel_data['Title']

        formats = []
        # `or []` (rather than a .get default) also tolerates explicit null values
        for streaming_server in channel_data.get('StreamingServers') or []:
            server = streaming_server.get('Server')
            if not server:
                continue
            link_type = streaming_server.get('LinkType')
            for quality in streaming_server.get('Qualities') or []:
                for stream in quality.get('Streams') or []:
                    stream_path = stream.get('Stream')
                    if not stream_path:
                        continue
                    base_url = '%s/%s' % (server, stream_path)
                    if link_type == 'HLS':
                        formats.extend(self._extract_m3u8_formats(
                            update_url_query(base_url, {'b': ''}), channel_id, 'mp4',
                            m3u8_id=link_type, fatal=False, live=True))
                    elif link_type == 'HDS':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(base_url, {'hdcore': '3.7.0'}),
                            channel_id, f4m_id=link_type, fatal=False))

        return {
            'id': channel_id,
            'title': title,
            'thumbnail': channel_data.get('PrimaryImageUri'),
            'formats': formats,
            'is_live': True,
        }
402
403
class DRTVSeasonIE(InfoExtractor):
    """Playlist extractor for DRTV season pages (/drtv/saeson/<slug>_<id>)."""

    IE_NAME = 'drtv:season'
    _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/saeson/(?P<display_id>[\w-]+)_(?P<id>\d+)'
    _GEO_COUNTRIES = ['DK']
    _TESTS = [{
        'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_9008',
        'info_dict': {
            'id': '9008',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 8
    }, {
        'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_8761',
        'info_dict': {
            'id': '8761',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 19
    }]

    def _real_extract(self, url):
        """Download the season page JSON and return a playlist of episode URL entries.

        Each entry is delegated to DRTVIE. Episodes that are not dicts or
        carry no ``path`` are skipped, and a missing episode list yields an
        empty playlist instead of an exception.
        """
        display_id, season_id = self._match_valid_url(url).group('display_id', 'id')
        data = self._download_json(SERIES_API % f'/saeson/{display_id}_{season_id}', display_id)

        # These values are the same for every entry, so look them up once
        series_title = traverse_obj(data, ('entries', 0, 'item', 'title'))
        season_number = traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
        # traverse_obj yields None when the path is absent; fall back to an empty list
        episodes = traverse_obj(data, ('entries', 0, 'item', 'episodes', 'items')) or []

        entries = [{
            '_type': 'url',
            'url': f'https://www.dr.dk/drtv{episode["path"]}',
            'ie_key': DRTVIE.ie_key(),
            'title': episode.get('title'),
            'episode': episode.get('episodeName'),
            'description': episode.get('shortDescription'),
            'series': series_title,
            'season_number': season_number,
            'episode_number': episode.get('episodeNumber'),
        } for episode in episodes if isinstance(episode, dict) and episode.get('path')]

        return {
            '_type': 'playlist',
            'id': season_id,
            'display_id': display_id,
            'title': series_title,
            'series': series_title,
            'entries': entries,
            'season_number': season_number,
        }
453
454
class DRTVSeriesIE(InfoExtractor):
    """Playlist extractor for DRTV series pages (/drtv/serie/<slug>_<id>)."""

    IE_NAME = 'drtv:series'
    _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/serie/(?P<display_id>[\w-]+)_(?P<id>\d+)'
    _GEO_COUNTRIES = ['DK']
    _TESTS = [{
        'url': 'https://www.dr.dk/drtv/serie/frank-and-kastaniegaarden_6954',
        'info_dict': {
            'id': '6954',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 15
    }]

    def _real_extract(self, url):
        """Download the series page JSON and return a playlist of season URL entries.

        Each entry is delegated to DRTVSeasonIE. Seasons that are not dicts
        or carry no ``path`` are skipped (previously a missing path produced
        a URL ending in "None"), and a missing season list yields an empty
        playlist instead of an exception.
        """
        display_id, series_id = self._match_valid_url(url).group('display_id', 'id')
        data = self._download_json(SERIES_API % f'/serie/{display_id}_{series_id}', display_id)

        # These values are the same for every entry, so look them up once
        series_title = traverse_obj(data, ('entries', 0, 'item', 'title'))
        season_number = traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
        # traverse_obj yields None when the path is absent; fall back to an empty list
        seasons = traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items')) or []

        entries = [{
            '_type': 'url',
            'url': f'https://www.dr.dk/drtv{season["path"]}',
            'ie_key': DRTVSeasonIE.ie_key(),
            'title': season.get('title'),
            'series': series_title,
            'season_number': season_number,
        } for season in seasons if isinstance(season, dict) and season.get('path')]

        return {
            '_type': 'playlist',
            'id': series_id,
            'display_id': display_id,
            'title': series_title,
            'series': series_title,
            'entries': entries,
        }
490 }