]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/drtv.py
[xanimu] Add extractor (#5969)
[yt-dlp.git] / yt_dlp / extractor / drtv.py
CommitLineData
a2d821d7
S
1import binascii
2import hashlib
3import re
4
5
1335c3ac 6from .common import InfoExtractor
1d3586d0 7from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
a2d821d7 8from ..compat import compat_urllib_parse_unquote
1335c3ac
S
9from ..utils import (
10 ExtractorError,
6066d03d
S
11 int_or_none,
12 float_or_none,
13 mimetype2ext,
a2d821d7 14 str_or_none,
ab4cbeff 15 traverse_obj,
5709d661 16 try_get,
a2d821d7 17 unified_timestamp,
2c15db82 18 update_url_query,
a2d821d7 19 url_or_none,
1335c3ac 20)
f2b8db57 21
82d02080 22
ab4cbeff
FNJS
# URL template for the dr-massive page API; the single `%s` placeholder is
# filled with a site path such as `/episode/<slug>_<id>`, `/saeson/<slug>_<id>`
# or `/serie/<slug>_<id>` by the extractors below.
SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s'
24
25
class DRTVIE(InfoExtractor):
    """Extractor for single programmes on dr.dk / dr-massive.com.

    The video id is located in the webpage, the matching "programcard" JSON
    is fetched from the mu-online API, and formats/subtitles are built from
    the assets listed in that programcard.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
                        )
                        (?P<id>[\da-z_-]+)
                    '''
    _GEO_BYPASS = False
    _GEO_COUNTRIES = ['DK']
    IE_NAME = 'drtv'
    _TESTS = [{
        'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
        'md5': '25e659cccc9a2ed956110a299fdf5983',
        'info_dict': {
            'id': 'klassen-darlig-taber-10',
            'ext': 'mp4',
            'title': 'Klassen - Dårlig taber (10)',
            'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
            'timestamp': 1539085800,
            'upload_date': '20181009',
            'duration': 606.84,
            'series': 'Klassen',
            'season': 'Klassen I',
            'season_number': 1,
            'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b',
            'episode': 'Episode 10',
            'episode_number': 10,
            'release_year': 2016,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
        'skip': 'this video has been removed',
    }, {
        # embed
        'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
        'info_dict': {
            'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
            'ext': 'mp4',
            'title': 'christiania pusher street ryddes drdkrjpo',
            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
            'timestamp': 1472800279,
            'upload_date': '20160902',
            'duration': 131.4,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        # with SignLanguage formats
        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
        'info_dict': {
            'id': '00831690010',
            'ext': 'mp4',
            'title': 'Historien om Danmark: Stenalder',
            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
            'timestamp': 1546628400,
            'upload_date': '20190104',
            'duration': 3504.618,
            'formats': 'mincount:20',
            'release_year': 2017,
            'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35',
            'season_number': 1,
            'season': 'Historien om Danmark',
            'series': 'Historien om Danmark',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
        'info_dict': {
            'id': '00951930010',
            'ext': 'mp4',
            'title': 'Bonderøven 2019 (1:8)',
            'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
            'timestamp': 1603188600,
            'upload_date': '20201020',
            'duration': 2576.6,
            'season': 'Bonderøven 2019',
            'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
            'release_year': 2019,
            'season_number': 2019,
            'series': 'Frank & Kastaniegaarden'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769',
        'only_matching': True,
    }, {
        'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/drtv/program/jagten_220924',
        'only_matching': True,
    }, {
        'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3',
        'info_dict': {
            'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113',
            'title': "Regionale nyheder",
            'ext': 'mp4',
            'duration': 120.043,
            'series': 'P4 Østjylland regionale nyheder',
            'timestamp': 1651746600,
            'season': 'Regionale nyheder',
            'release_year': 0,
            'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5',
            'description': '',
            'upload_date': '20220505',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Maps the language names used in programcard subtitle entries to the
    # language codes used as subtitle keys; unknown names are used verbatim.
    _SUBTITLE_LANGS = {
        'Danish': 'da',
    }

    @staticmethod
    def _decrypt_uri(encrypted_uri):
        """Decrypt an ``EncryptedUri`` value and return the URL without its query.

        Layout of the input string, as read by this function:
        characters 2-9 are a hex number ``n``; the next ``n`` characters are
        the hex-encoded ciphertext; everything after that is the hex-encoded
        IV. The AES key is the SHA-256 digest of ``'<iv hex>:<fixed secret>'``.
        The ciphertext is AES-CBC decrypted, PKCS#7 unpadded and decoded as
        UTF-8; anything from the first ``?`` onwards is dropped.
        """
        ciphertext_len = int(encrypted_uri[2:10], 16)
        ciphertext_end = 10 + ciphertext_len
        iv_hex = encrypted_uri[ciphertext_end:]
        ciphertext = binascii.a2b_hex(encrypted_uri[10:ciphertext_end].encode('ascii'))
        key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % iv_hex).encode('utf-8')).digest()
        iv = binascii.a2b_hex(iv_hex.encode('ascii'))
        decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, iv))
        return decrypted.decode('utf-8').split('?')[0]

    def _real_extract(self, url):
        """Return the info dict for the programme at *url*.

        Raises ExtractorError when the page says the programme is no longer
        available or when no video id can be determined, and a geo-restriction
        error when no formats were found for a Denmark-only asset.
        """
        raw_video_id = self._match_id(url)

        webpage = self._download_webpage(url, raw_video_id)

        if '>Programmet er ikke længere tilgængeligt' in webpage:
            raise ExtractorError(
                'Video %s is not available' % raw_video_id, expected=True)

        video_id = self._search_regex(
            (r'data-(?:material-identifier|episode-slug)="([^"]+)"',
             r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
            webpage, 'video id', default=None)

        if not video_id:
            # Fall back to a (possibly percent-encoded) programcard URN
            video_id = self._search_regex(
                r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
                webpage, 'urn', default=None)
            if video_id:
                video_id = compat_urllib_parse_unquote(video_id)

        _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard'
        query = {'expanded': 'true'}

        if video_id:
            programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
        else:
            # No id in the markup: take the production number from the
            # page's embedded JSON and query the programcard API by it.
            programcard_url = _PROGRAMCARD_BASE
            pages = self._parse_json(
                self._search_regex(
                    r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage,
                    'data'), raw_video_id)['cache']['page']
            # Only the first cached page is inspected
            page = next(iter(pages.values()), None)
            item = try_get(
                page, (lambda x: x['item'], lambda x: x['entries'][0]['item']),
                dict)
            custom_id = item.get('customId') if item else None
            if not isinstance(custom_id, str):
                # Fail with a proper extractor error instead of a bare
                # TypeError/KeyError when the embedded data has no usable item
                raise ExtractorError('Unable to extract video id')
            video_id = custom_id.split(':')[-1]
            query['productionnumber'] = video_id

        data = self._download_json(
            programcard_url, video_id, 'Downloading video JSON', query=query)

        # Extra episode metadata is only requested for ids ending in `_<digits>`.
        # It is best-effort: a failed request must not abort the extraction, so
        # the download is non-fatal and any falsy result becomes an empty dict.
        supplementary_data = {}
        if re.search(r'_\d+$', raw_video_id):
            supplementary_data = self._download_json(
                SERIES_API % f'/episode/{raw_video_id}', raw_video_id,
                fatal=False) or {}

        title = str_or_none(data.get('Title')) or re.sub(
            r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
            self._og_search_title(webpage))
        description = self._og_search_description(
            webpage, default=None) or data.get('Description')

        timestamp = unified_timestamp(
            data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime'))

        thumbnail = None
        duration = None

        restricted_to_denmark = False

        formats = []
        subtitles = {}

        # Primary asset first, then every dict-typed secondary asset
        assets = []
        primary_asset = data.get('PrimaryAsset')
        if isinstance(primary_asset, dict):
            assets.append(primary_asset)
        secondary_assets = data.get('SecondaryAssets')
        if isinstance(secondary_assets, list):
            assets.extend(
                asset for asset in secondary_assets if isinstance(asset, dict))

        for asset in assets:
            kind = asset.get('Kind')
            if kind == 'Image':
                thumbnail = url_or_none(asset.get('Uri'))
            elif kind in ('VideoResource', 'AudioResource'):
                duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
                restricted_to_denmark = asset.get('RestrictedToDenmark')
                asset_target = asset.get('Target')
                # `or []` also covers a `Links` key that is present but null
                for link in asset.get('Links') or []:
                    uri = link.get('Uri')
                    if not uri:
                        encrypted_uri = link.get('EncryptedUri')
                        if not encrypted_uri:
                            continue
                        try:
                            uri = self._decrypt_uri(encrypted_uri)
                        except Exception:
                            # Deliberately broad: a malformed value can fail in
                            # hex parsing, decryption or decoding; in every
                            # case just skip this link with a warning.
                            self.report_warning(
                                'Unable to decrypt EncryptedUri', video_id)
                            continue
                    uri = url_or_none(uri)
                    if not uri:
                        continue
                    target = link.get('Target')
                    format_id = target or ''
                    if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
                        # Alternative renditions rank below the default one
                        preference = -1
                        format_id += '-%s' % asset_target
                    elif asset_target == 'Default':
                        preference = 1
                    else:
                        preference = None
                    if target == 'HDS':
                        f4m_formats = self._extract_f4m_formats(
                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
                            video_id, preference, f4m_id=format_id, fatal=False)
                        if kind == 'AudioResource':
                            for f in f4m_formats:
                                f['vcodec'] = 'none'
                        formats.extend(f4m_formats)
                    elif target == 'HLS':
                        formats.extend(self._extract_m3u8_formats(
                            uri, video_id, 'mp4', entry_protocol='m3u8_native',
                            quality=preference, m3u8_id=format_id,
                            fatal=False))
                    else:
                        # Anything else is treated as a direct media URL
                        bitrate = link.get('Bitrate')
                        if bitrate:
                            format_id += '-%s' % bitrate
                        formats.append({
                            'url': uri,
                            'format_id': format_id,
                            'tbr': int_or_none(bitrate),
                            'ext': link.get('FileFormat'),
                            'vcodec': 'none' if kind == 'AudioResource' else None,
                            'quality': preference,
                        })
                # Both spellings of the key are looked up
                subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
                if isinstance(subtitles_list, list):
                    for subs in subtitles_list:
                        if not isinstance(subs, dict):
                            continue
                        sub_uri = url_or_none(subs.get('Uri'))
                        if not sub_uri:
                            continue
                        lang = subs.get('Language') or 'da'
                        subtitles.setdefault(self._SUBTITLE_LANGS.get(lang, lang), []).append({
                            'url': sub_uri,
                            'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
                        })

        if not formats and restricted_to_denmark:
            self.raise_geo_restricted(
                'Unfortunately, DR is not allowed to show this program outside Denmark.',
                countries=self._GEO_COUNTRIES)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
            'series': str_or_none(data.get('SeriesTitle')),
            'season': str_or_none(data.get('SeasonTitle')),
            'season_number': int_or_none(data.get('SeasonNumber')),
            'season_id': str_or_none(data.get('SeasonUrn')),
            # Prefer the supplementary episode metadata, fall back to the programcard
            'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')),
            'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')),
            'release_year': int_or_none(data.get('ProductionYear')),
        }
2c15db82
RA
328
329
class DRTVLiveIE(InfoExtractor):
    """Extractor for DR live channels (``dr.dk/tv/live/<channel>``).

    Channel metadata is fetched from the mu-online channel API and one set of
    formats is built per stream listed under the channel's streaming servers.
    """

    IE_NAME = 'drtv:live'
    _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
    _GEO_COUNTRIES = ['DK']
    _TEST = {
        'url': 'https://www.dr.dk/tv/live/dr1',
        'info_dict': {
            'id': 'dr1',
            'ext': 'mp4',
            'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return a live-stream info dict for the channel in *url*.

        A missing ``Title`` in the channel JSON raises KeyError; servers
        without a ``Server`` value and streams without a ``Stream`` path are
        skipped.
        """
        channel_id = self._match_id(url)
        channel_data = self._download_json(
            'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id,
            channel_id)
        title = channel_data['Title']

        formats = []
        # `or []` keeps the loops safe when a key is present but null
        for streaming_server in channel_data.get('StreamingServers') or []:
            server = streaming_server.get('Server')
            if not server:
                continue
            link_type = streaming_server.get('LinkType')
            for quality in streaming_server.get('Qualities') or []:
                for stream in quality.get('Streams') or []:
                    stream_path = stream.get('Stream')
                    if not stream_path:
                        continue
                    base_url = '%s/%s' % (server, stream_path)
                    # Each protocol gets its own query parameter; other link
                    # types are ignored.
                    if link_type == 'HLS':
                        formats.extend(self._extract_m3u8_formats(
                            update_url_query(base_url, {'b': ''}),
                            channel_id, 'mp4',
                            m3u8_id=link_type, fatal=False, live=True))
                    elif link_type == 'HDS':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(base_url, {'hdcore': '3.7.0'}),
                            channel_id, f4m_id=link_type, fatal=False))

        return {
            'id': channel_id,
            'title': title,
            'thumbnail': channel_data.get('PrimaryImageUri'),
            'formats': formats,
            'is_live': True,
        }
ab4cbeff
FNJS
383
384
class DRTVSeasonIE(InfoExtractor):
    """Playlist extractor for a DRTV season page (``/drtv/saeson/<slug>_<id>``).

    Every episode listed by the page API becomes a URL entry that is handed
    to DRTVIE.
    """

    IE_NAME = 'drtv:season'
    _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/saeson/(?P<display_id>[\w-]+)_(?P<id>\d+)'
    _GEO_COUNTRIES = ['DK']
    _TESTS = [{
        'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_9008',
        'info_dict': {
            'id': '9008',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 8
    }, {
        'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_8761',
        'info_dict': {
            'id': '8761',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 19
    }]

    def _real_extract(self, url):
        """Return a playlist of the season's episodes."""
        display_id, season_id = self._match_valid_url(url).group('display_id', 'id')
        data = self._download_json(SERIES_API % f'/saeson/{display_id}_{season_id}', display_id)

        # Season-level values are shared by the playlist and all its entries
        series_title = traverse_obj(data, ('entries', 0, 'item', 'title'))
        season_number = traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
        # traverse_obj yields None when the path is absent; treat that as "no episodes"
        episodes = traverse_obj(data, ('entries', 0, 'item', 'episodes', 'items')) or []

        entries = []
        for episode in episodes:
            path = episode.get('path')
            if not path:
                # Without a path there is no URL to delegate to; skip the
                # episode rather than failing the whole playlist
                continue
            entries.append({
                '_type': 'url',
                'url': f'https://www.dr.dk/drtv{path}',
                'ie_key': DRTVIE.ie_key(),
                'title': episode.get('title'),
                'episode': episode.get('episodeName'),
                'description': episode.get('shortDescription'),
                'series': series_title,
                'season_number': season_number,
                'episode_number': episode.get('episodeNumber'),
            })

        return {
            '_type': 'playlist',
            'id': season_id,
            'display_id': display_id,
            'title': series_title,
            'series': series_title,
            'entries': entries,
            'season_number': season_number,
        }
434
435
class DRTVSeriesIE(InfoExtractor):
    """Playlist extractor for a DRTV series page (``/drtv/serie/<slug>_<id>``).

    Every season listed by the page API becomes a URL entry that is handed
    to DRTVSeasonIE.
    """

    IE_NAME = 'drtv:series'
    _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/serie/(?P<display_id>[\w-]+)_(?P<id>\d+)'
    _GEO_COUNTRIES = ['DK']
    _TESTS = [{
        'url': 'https://www.dr.dk/drtv/serie/frank-and-kastaniegaarden_6954',
        'info_dict': {
            'id': '6954',
            'display_id': 'frank-and-kastaniegaarden',
            'title': 'Frank & Kastaniegaarden',
            'series': 'Frank & Kastaniegaarden',
        },
        'playlist_mincount': 15
    }]

    def _real_extract(self, url):
        """Return a playlist of the series' seasons."""
        display_id, series_id = self._match_valid_url(url).group('display_id', 'id')
        data = self._download_json(SERIES_API % f'/serie/{display_id}_{series_id}', display_id)

        series_title = traverse_obj(data, ('entries', 0, 'item', 'title'))
        # Read from the page's top-level item, so it is identical for every entry
        season_number = traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
        # traverse_obj yields None when the path is absent; treat that as "no seasons"
        seasons = traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items')) or []

        entries = []
        for season in seasons:
            path = season.get('path')
            if not path:
                # A missing path would otherwise produce the bogus URL
                # 'https://www.dr.dk/drtvNone'
                continue
            entries.append({
                '_type': 'url',
                'url': f'https://www.dr.dk/drtv{path}',
                'ie_key': DRTVSeasonIE.ie_key(),
                'title': season.get('title'),
                'series': series_title,
                'season_number': season_number,
            })

        return {
            '_type': 'playlist',
            'id': series_id,
            'display_id': display_id,
            'title': series_title,
            'series': series_title,
            'entries': entries,
        }