yt_dlp/extractor/drtv.py

   1 import binascii
   2 import hashlib
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
   7 from ..compat import compat_urllib_parse_unquote
   8 from ..utils import (
   9     ExtractorError,
  10     float_or_none,
  11     int_or_none,
  12     mimetype2ext,
  13     str_or_none,
  14     traverse_obj,
  15     unified_timestamp,
  16     update_url_query,
  17     url_or_none,
  18 )
  19
  20 SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s'
  21
  22
  23 class DRTVIE(InfoExtractor):
  24     _VALID_URL = r'''(?x)
  25                     https?://
  26                         (?:
  27                             (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
  28                             (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
  29                         )
  30                         (?P<id>[\da-z_-]+)
  31                     '''
  32     _GEO_BYPASS = False
  33     _GEO_COUNTRIES = ['DK']
  34     IE_NAME = 'drtv'
  35     _TESTS = [{
  36         'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
  37         'md5': '25e659cccc9a2ed956110a299fdf5983',
  38         'info_dict': {
  39             'id': 'klassen-darlig-taber-10',
  40             'ext': 'mp4',
  41             'title': 'Klassen - Dårlig taber (10)',
  42             'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
  43             'timestamp': 1539085800,
  44             'upload_date': '20181009',
  45             'duration': 606.84,
  46             'series': 'Klassen',
  47             'season': 'Klassen I',
  48             'season_number': 1,
  49             'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b',
  50             'episode': 'Episode 10',
  51             'episode_number': 10,
  52             'release_year': 2016,
  53         },
  54         'expected_warnings': ['Unable to download f4m manifest'],
  55         'skip': 'this video has been removed',
  56     }, {
  57         # embed
  58         'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
  59         'info_dict': {
  60             'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
  61             'ext': 'mp4',
  62             'title': 'christiania pusher street ryddes drdkrjpo',
  63             'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
  64             'timestamp': 1472800279,
  65             'upload_date': '20160902',
  66             'duration': 131.4,
  67         },
  68         'params': {
  69             'skip_download': True,
  70         },
  71         'expected_warnings': ['Unable to download f4m manifest'],
  72     }, {
  73         # with SignLanguage formats
  74         'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
  75         'info_dict': {
  76             'id': '00831690010',
  77             'ext': 'mp4',
  78             'title': 'Historien om Danmark: Stenalder',
  79             'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
  80             'timestamp': 1546628400,
  81             'upload_date': '20190104',
  82             'duration': 3504.619,
  83             'formats': 'mincount:20',
  84             'release_year': 2017,
  85             'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35',
  86             'season_number': 1,
  87             'season': 'Historien om Danmark',
  88             'series': 'Historien om Danmark',
  89         },
  90         'params': {
  91             'skip_download': True,
  92         },
  93     }, {
  94         'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
  95         'only_matching': True,
  96     }, {
  97         'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
  98         'info_dict': {
  99             'id': '00951930010',
 100             'ext': 'mp4',
 101             'title': 'Bonderøven 2019 (1:8)',
 102             'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
 103             'timestamp': 1654856100,
 104             'upload_date': '20220610',
 105             'duration': 2576.6,
 106             'season': 'Bonderøven 2019',
 107             'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
 108             'release_year': 2019,
 109             'season_number': 2019,
 110             'series': 'Frank & Kastaniegaarden',
 111             'episode_number': 1,
 112             'episode': 'Episode 1',
 113         },
 114         'params': {
 115             'skip_download': True,
 116         },
 117     }, {
 118         'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769',
 119         'only_matching': True,
 120     }, {
 121         'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
 122         'only_matching': True,
 123     }, {
 124         'url': 'https://www.dr.dk/drtv/program/jagten_220924',
 125         'only_matching': True,
 126     }, {
 127         'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3',
 128         'info_dict': {
 129             'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113',
 130             'title': "Regionale nyheder",
 131             'ext': 'mp4',
 132             'duration': 120.043,
 133             'series': 'P4 Østjylland regionale nyheder',
 134             'timestamp': 1651746600,
 135             'season': 'Regionale nyheder',
 136             'release_year': 0,
 137             'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5',
 138             'description': '',
 139             'upload_date': '20220505',
 140         },
 141         'params': {
 142             'skip_download': True,
 143         },
 144         'skip': 'this video has been removed',
 145     }, {
 146         'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9',
 147         'info_dict': {
 148             'ext': 'mp4',
 149             'id': '14802310112',
 150             'timestamp': 1678786200,
 151             'duration': 120.043,
 152             'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f',
 153             'series': 'P4 København regionale nyheder',
 154             'upload_date': '20230314',
 155             'release_year': 0,
 156             'description': 'Hør seneste regionale nyheder fra P4 København.',
 157             'season': 'Regionale nyheder',
 158             'title': 'Regionale nyheder',
 159         },
 160     }]
 161
 162     def _real_extract(self, url):
 163         raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio')
 164
 165         webpage = self._download_webpage(url, raw_video_id)
 166
 167         if '>Programmet er ikke længere tilgængeligt' in webpage:
 168             raise ExtractorError(
 169                 'Video %s is not available' % raw_video_id, expected=True)
 170
 171         video_id = self._search_regex(
 172             (r'data-(?:material-identifier|episode-slug)="([^"]+)"',
 173              r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
 174             webpage, 'video id', default=None)
 175
 176         if not video_id:
 177             video_id = self._search_regex(
 178                 r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
 179                 webpage, 'urn', default=None)
 180             if video_id:
 181                 video_id = compat_urllib_parse_unquote(video_id)
 182
 183         _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard'
 184         query = {'expanded': 'true'}
 185
 186         if video_id:
 187             programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
 188         else:
 189             programcard_url = _PROGRAMCARD_BASE
 190             if is_radio_url:
 191                 video_id = self._search_nextjs_data(
 192                     webpage, raw_video_id)['props']['pageProps']['episode']['productionNumber']
 193             else:
 194                 json_data = self._search_json(
 195                     r'window\.__data\s*=', webpage, 'data', raw_video_id)
 196                 video_id = traverse_obj(json_data, (
 197                     'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId',
 198                     {lambda x: x.split(':')[-1]}), get_all=False)
 199                 if not video_id:
 200                     raise ExtractorError('Unable to extract video id')
 201             query['productionnumber'] = video_id
 202
 203         data = self._download_json(
 204             programcard_url, video_id, 'Downloading video JSON', query=query)
 205
 206         supplementary_data = {}
 207         if re.search(r'_\d+$', raw_video_id):
 208             supplementary_data = self._download_json(
 209                 SERIES_API % f'/episode/{raw_video_id}', raw_video_id, fatal=False) or {}
 210
 211         title = str_or_none(data.get('Title')) or re.sub(
 212             r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
 213             self._og_search_title(webpage))
 214         description = self._og_search_description(
 215             webpage, default=None) or data.get('Description')
 216
 217         timestamp = unified_timestamp(
 218             data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime'))
 219
 220         thumbnail = None
 221         duration = None
 222
 223         restricted_to_denmark = False
 224
 225         formats = []
 226         subtitles = {}
 227
 228         assets = []
 229         primary_asset = data.get('PrimaryAsset')
 230         if isinstance(primary_asset, dict):
 231             assets.append(primary_asset)
 232         secondary_assets = data.get('SecondaryAssets')
 233         if isinstance(secondary_assets, list):
 234             for secondary_asset in secondary_assets:
 235                 if isinstance(secondary_asset, dict):
 236                     assets.append(secondary_asset)
 237
 238         def hex_to_bytes(hex):
 239             return binascii.a2b_hex(hex.encode('ascii'))
 240
 241         def decrypt_uri(e):
 242             n = int(e[2:10], 16)
 243             a = e[10 + n:]
 244             data = hex_to_bytes(e[10:10 + n])
 245             key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest()
 246             iv = hex_to_bytes(a)
 247             decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, iv))
 248             return decrypted.decode('utf-8').split('?')[0]
 249
 250         for asset in assets:
 251             kind = asset.get('Kind')
 252             if kind == 'Image':
 253                 thumbnail = url_or_none(asset.get('Uri'))
 254             elif kind in ('VideoResource', 'AudioResource'):
 255                 duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
 256                 restricted_to_denmark = asset.get('RestrictedToDenmark')
 257                 asset_target = asset.get('Target')
 258                 for link in asset.get('Links', []):
 259                     uri = link.get('Uri')
 260                     if not uri:
 261                         encrypted_uri = link.get('EncryptedUri')
 262                         if not encrypted_uri:
 263                             continue
 264                         try:
 265                             uri = decrypt_uri(encrypted_uri)
 266                         except Exception:
 267                             self.report_warning(
 268                                 'Unable to decrypt EncryptedUri', video_id)
 269                             continue
 270                     uri = url_or_none(uri)
 271                     if not uri:
 272                         continue
 273                     target = link.get('Target')
 274                     format_id = target or ''
 275                     if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
 276                         preference = -1
 277                         format_id += '-%s' % asset_target
 278                     elif asset_target == 'Default':
 279                         preference = 1
 280                     else:
 281                         preference = None
 282                     if target == 'HDS':
 283                         f4m_formats = self._extract_f4m_formats(
 284                             uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
 285                             video_id, preference, f4m_id=format_id, fatal=False)
 286                         if kind == 'AudioResource':
 287                             for f in f4m_formats:
 288                                 f['vcodec'] = 'none'
 289                         formats.extend(f4m_formats)
 290                     elif target == 'HLS':
 291                         fmts, subs = self._extract_m3u8_formats_and_subtitles(
 292                             uri, video_id, 'mp4', entry_protocol='m3u8_native',
 293                             quality=preference, m3u8_id=format_id, fatal=False)
 294                         formats.extend(fmts)
 295                         self._merge_subtitles(subs, target=subtitles)
 296                     else:
 297                         bitrate = link.get('Bitrate')
 298                         if bitrate:
 299                             format_id += '-%s' % bitrate
 300                         formats.append({
 301                             'url': uri,
 302                             'format_id': format_id,
 303                             'tbr': int_or_none(bitrate),
 304                             'ext': link.get('FileFormat'),
 305                             'vcodec': 'none' if kind == 'AudioResource' else None,
 306                             'quality': preference,
 307                         })
 308             subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
 309             if isinstance(subtitles_list, list):
 310                 LANGS = {
 311                     'Danish': 'da',
 312                 }
 313                 for subs in subtitles_list:
 314                     if not isinstance(subs, dict):
 315                         continue
 316                     sub_uri = url_or_none(subs.get('Uri'))
 317                     if not sub_uri:
 318                         continue
 319                     lang = subs.get('Language') or 'da'
 320                     subtitles.setdefault(LANGS.get(lang, lang), []).append({
 321                         'url': sub_uri,
 322                         'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
 323                     })
 324
 325         if not formats and restricted_to_denmark:
 326             self.raise_geo_restricted(
 327                 'Unfortunately, DR is not allowed to show this program outside Denmark.',
 328                 countries=self._GEO_COUNTRIES)
 329
 330         return {
 331             'id': video_id,
 332             'title': title,
 333             'description': description,
 334             'thumbnail': thumbnail,
 335             'timestamp': timestamp,
 336             'duration': duration,
 337             'formats': formats,
 338             'subtitles': subtitles,
 339             'series': str_or_none(data.get('SeriesTitle')),
 340             'season': str_or_none(data.get('SeasonTitle')),
 341             'season_number': int_or_none(data.get('SeasonNumber')),
 342             'season_id': str_or_none(data.get('SeasonUrn')),
 343             'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')),
 344             'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')),
 345             'release_year': int_or_none(data.get('ProductionYear')),
 346         }
 347
 348
 349 class DRTVLiveIE(InfoExtractor):
 350     IE_NAME = 'drtv:live'
 351     _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
 352     _GEO_COUNTRIES = ['DK']
 353     _TEST = {
 354         'url': 'https://www.dr.dk/tv/live/dr1',
 355         'info_dict': {
 356             'id': 'dr1',
 357             'ext': 'mp4',
 358             'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
 359         },
 360         'params': {
 361             # m3u8 download
 362             'skip_download': True,
 363         },
 364     }
 365
 366     def _real_extract(self, url):
 367         channel_id = self._match_id(url)
 368         channel_data = self._download_json(
 369             'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id,
 370             channel_id)
 371         title = channel_data['Title']
 372
 373         formats = []
 374         for streaming_server in channel_data.get('StreamingServers', []):
 375             server = streaming_server.get('Server')
 376             if not server:
 377                 continue
 378             link_type = streaming_server.get('LinkType')
 379             for quality in streaming_server.get('Qualities', []):
 380                 for stream in quality.get('Streams', []):
 381                     stream_path = stream.get('Stream')
 382                     if not stream_path:
 383                         continue
 384                     stream_url = update_url_query(
 385                         '%s/%s' % (server, stream_path), {'b': ''})
 386                     if link_type == 'HLS':
 387                         formats.extend(self._extract_m3u8_formats(
 388                             stream_url, channel_id, 'mp4',
 389                             m3u8_id=link_type, fatal=False, live=True))
 390                     elif link_type == 'HDS':
 391                         formats.extend(self._extract_f4m_formats(update_url_query(
 392                             '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}),
 393                             channel_id, f4m_id=link_type, fatal=False))
 394
 395         return {
 396             'id': channel_id,
 397             'title': title,
 398             'thumbnail': channel_data.get('PrimaryImageUri'),
 399             'formats': formats,
 400             'is_live': True,
 401         }
 402
 403
 404 class DRTVSeasonIE(InfoExtractor):
 405     IE_NAME = 'drtv:season'
 406     _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/saeson/(?P<display_id>[\w-]+)_(?P<id>\d+)'
 407     _GEO_COUNTRIES = ['DK']
 408     _TESTS = [{
 409         'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_9008',
 410         'info_dict': {
 411             'id': '9008',
 412             'display_id': 'frank-and-kastaniegaarden',
 413             'title': 'Frank & Kastaniegaarden',
 414             'series': 'Frank & Kastaniegaarden',
 415         },
 416         'playlist_mincount': 8
 417     }, {
 418         'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_8761',
 419         'info_dict': {
 420             'id': '8761',
 421             'display_id': 'frank-and-kastaniegaarden',
 422             'title': 'Frank & Kastaniegaarden',
 423             'series': 'Frank & Kastaniegaarden',
 424         },
 425         'playlist_mincount': 19
 426     }]
 427
 428     def _real_extract(self, url):
 429         display_id, season_id = self._match_valid_url(url).group('display_id', 'id')
 430         data = self._download_json(SERIES_API % f'/saeson/{display_id}_{season_id}', display_id)
 431
 432         entries = [{
 433             '_type': 'url',
 434             'url': f'https://www.dr.dk/drtv{episode["path"]}',
 435             'ie_key': DRTVIE.ie_key(),
 436             'title': episode.get('title'),
 437             'episode': episode.get('episodeName'),
 438             'description': episode.get('shortDescription'),
 439             'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
 440             'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')),
 441             'episode_number': episode.get('episodeNumber'),
 442         } for episode in traverse_obj(data, ('entries', 0, 'item', 'episodes', 'items'))]
 443
 444         return {
 445             '_type': 'playlist',
 446             'id': season_id,
 447             'display_id': display_id,
 448             'title': traverse_obj(data, ('entries', 0, 'item', 'title')),
 449             'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
 450             'entries': entries,
 451             'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
 452         }
 453
 454
 455 class DRTVSeriesIE(InfoExtractor):
 456     IE_NAME = 'drtv:series'
 457     _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/serie/(?P<display_id>[\w-]+)_(?P<id>\d+)'
 458     _GEO_COUNTRIES = ['DK']
 459     _TESTS = [{
 460         'url': 'https://www.dr.dk/drtv/serie/frank-and-kastaniegaarden_6954',
 461         'info_dict': {
 462             'id': '6954',
 463             'display_id': 'frank-and-kastaniegaarden',
 464             'title': 'Frank & Kastaniegaarden',
 465             'series': 'Frank & Kastaniegaarden',
 466         },
 467         'playlist_mincount': 15
 468     }]
 469
 470     def _real_extract(self, url):
 471         display_id, series_id = self._match_valid_url(url).group('display_id', 'id')
 472         data = self._download_json(SERIES_API % f'/serie/{display_id}_{series_id}', display_id)
 473
 474         entries = [{
 475             '_type': 'url',
 476             'url': f'https://www.dr.dk/drtv{season.get("path")}',
 477             'ie_key': DRTVSeasonIE.ie_key(),
 478             'title': season.get('title'),
 479             'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
 480             'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber'))
 481         } for season in traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items'))]
 482
 483         return {
 484             '_type': 'playlist',
 485             'id': series_id,
 486             'display_id': display_id,
 487             'title': traverse_obj(data, ('entries', 0, 'item', 'title')),
 488             'series': traverse_obj(data, ('entries', 0, 'item', 'title')),
 489             'entries': entries
 490         }