yt_dlp/extractor/canvas.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from .gigya import GigyaBaseIE
   8 from ..compat import compat_HTTPError
   9 from ..utils import (
  10     ExtractorError,
  11     clean_html,
  12     extract_attributes,
  13     float_or_none,
  14     get_element_by_class,
  15     int_or_none,
  16     merge_dicts,
  17     str_or_none,
  18     strip_or_none,
  19     url_or_none,
  20 )
  21
  22
  23 class CanvasIE(InfoExtractor):
  24     _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
  25     _TESTS = [{
  26         'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  27         'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
  28         'info_dict': {
  29             'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  30             'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  31             'ext': 'mp4',
  32             'title': 'Nachtwacht: De Greystook',
  33             'description': 'Nachtwacht: De Greystook',
  34             'thumbnail': r're:^https?://.*\.jpg$',
  35             'duration': 1468.02,
  36         },
  37         'expected_warnings': ['is not a supported codec'],
  38     }, {
  39         'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  40         'only_matching': True,
  41     }]
  42     _GEO_BYPASS = False
  43     _HLS_ENTRY_PROTOCOLS_MAP = {
  44         'HLS': 'm3u8_native',
  45         'HLS_AES': 'm3u8',
  46     }
  47     _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
  48
  49     def _real_extract(self, url):
  50         mobj = re.match(self._VALID_URL, url)
  51         site_id, video_id = mobj.group('site_id'), mobj.group('id')
  52
  53         data = None
  54         if site_id != 'vrtvideo':
  55             # Old API endpoint, serves more formats but may fail for some videos
  56             data = self._download_json(
  57                 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  58                 % (site_id, video_id), video_id, 'Downloading asset JSON',
  59                 'Unable to download asset JSON', fatal=False)
  60
  61         # New API endpoint
  62         if not data:
  63             headers = self.geo_verification_headers()
  64             headers.update({'Content-Type': 'application/json'})
  65             token = self._download_json(
  66                 '%s/tokens' % self._REST_API_BASE, video_id,
  67                 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
  68             data = self._download_json(
  69                 '%s/videos/%s' % (self._REST_API_BASE, video_id),
  70                 video_id, 'Downloading video JSON', query={
  71                     'vrtPlayerToken': token,
  72                     'client': '%s@PROD' % site_id,
  73                 }, expected_status=400)
  74             if not data.get('title'):
  75                 code = data.get('code')
  76                 if code == 'AUTHENTICATION_REQUIRED':
  77                     self.raise_login_required()
  78                 elif code == 'INVALID_LOCATION':
  79                     self.raise_geo_restricted(countries=['BE'])
  80                 raise ExtractorError(data.get('message') or code, expected=True)
  81
  82         title = data['title']
  83         description = data.get('description')
  84
  85         formats = []
  86         subtitles = {}
  87         for target in data['targetUrls']:
  88             format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
  89             if not format_url or not format_type:
  90                 continue
  91             format_type = format_type.upper()
  92             if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  93                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
  94                     format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  95                     m3u8_id=format_type, fatal=False)
  96                 formats.extend(fmts)
  97                 subtitles = self._merge_subtitles(subtitles, subs)
  98             elif format_type == 'HDS':
  99                 formats.extend(self._extract_f4m_formats(
 100                     format_url, video_id, f4m_id=format_type, fatal=False))
 101             elif format_type == 'MPEG_DASH':
 102                 fmts, subs = self._extract_mpd_formats_and_subtitles(
 103                     format_url, video_id, mpd_id=format_type, fatal=False)
 104                 formats.extend(fmts)
 105                 subtitles = self._merge_subtitles(subtitles, subs)
 106             elif format_type == 'HSS':
 107                 fmts, subs = self._extract_ism_formats_and_subtitles(
 108                     format_url, video_id, ism_id='mss', fatal=False)
 109                 formats.extend(fmts)
 110                 subtitles = self._merge_subtitles(subtitles, subs)
 111             else:
 112                 formats.append({
 113                     'format_id': format_type,
 114                     'url': format_url,
 115                 })
 116         self._sort_formats(formats)
 117
 118         subtitle_urls = data.get('subtitleUrls')
 119         if isinstance(subtitle_urls, list):
 120             for subtitle in subtitle_urls:
 121                 subtitle_url = subtitle.get('url')
 122                 if subtitle_url and subtitle.get('type') == 'CLOSED':
 123                     subtitles.setdefault('nl', []).append({'url': subtitle_url})
 124
 125         return {
 126             'id': video_id,
 127             'display_id': video_id,
 128             'title': title,
 129             'description': description,
 130             'formats': formats,
 131             'duration': float_or_none(data.get('duration'), 1000),
 132             'thumbnail': data.get('posterImageUrl'),
 133             'subtitles': subtitles,
 134         }
 135
 136
 137 class CanvasEenIE(InfoExtractor):
 138     IE_DESC = 'canvas.be and een.be'
 139     _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 140     _TESTS = [{
 141         'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
 142         'md5': 'ed66976748d12350b118455979cca293',
 143         'info_dict': {
 144             'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
 145             'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
 146             'ext': 'flv',
 147             'title': 'De afspraak veilt voor de Warmste Week',
 148             'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
 149             'thumbnail': r're:^https?://.*\.jpg$',
 150             'duration': 49.02,
 151         },
 152         'expected_warnings': ['is not a supported codec'],
 153     }, {
 154         # with subtitles
 155         'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
 156         'info_dict': {
 157             'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
 158             'display_id': 'pieter-0167',
 159             'ext': 'mp4',
 160             'title': 'Pieter 0167',
 161             'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
 162             'thumbnail': r're:^https?://.*\.jpg$',
 163             'duration': 2553.08,
 164             'subtitles': {
 165                 'nl': [{
 166                     'ext': 'vtt',
 167                 }],
 168             },
 169         },
 170         'params': {
 171             'skip_download': True,
 172         },
 173         'skip': 'Pagina niet gevonden',
 174     }, {
 175         'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
 176         'info_dict': {
 177             'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
 178             'display_id': 'emma-pakt-thilly-aan',
 179             'ext': 'mp4',
 180             'title': 'Emma pakt Thilly aan',
 181             'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
 182             'thumbnail': r're:^https?://.*\.jpg$',
 183             'duration': 118.24,
 184         },
 185         'params': {
 186             'skip_download': True,
 187         },
 188         'expected_warnings': ['is not a supported codec'],
 189     }, {
 190         'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
 191         'only_matching': True,
 192     }]
 193
 194     def _real_extract(self, url):
 195         mobj = re.match(self._VALID_URL, url)
 196         site_id, display_id = mobj.group('site_id'), mobj.group('id')
 197
 198         webpage = self._download_webpage(url, display_id)
 199
 200         title = strip_or_none(self._search_regex(
 201             r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
 202             webpage, 'title', default=None) or self._og_search_title(
 203             webpage, default=None))
 204
 205         video_id = self._html_search_regex(
 206             r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
 207             group='id')
 208
 209         return {
 210             '_type': 'url_transparent',
 211             'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
 212             'ie_key': CanvasIE.ie_key(),
 213             'id': video_id,
 214             'display_id': display_id,
 215             'title': title,
 216             'description': self._og_search_description(webpage),
 217         }
 218
 219
 220 class VrtNUIE(GigyaBaseIE):
 221     IE_DESC = 'VrtNU.be'
 222     _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
 223     _TESTS = [{
 224         # Available via old API endpoint
 225         'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
 226         'info_dict': {
 227             'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
 228             'ext': 'mp4',
 229             'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
 230             'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
 231             'duration': 1457.04,
 232             'thumbnail': r're:^https?://.*\.jpg$',
 233             'series': 'Postbus X',
 234             'season': 'Seizoen 1989',
 235             'season_number': 1989,
 236             'episode': 'De zwarte weduwe',
 237             'episode_number': 1,
 238             'timestamp': 1595822400,
 239             'upload_date': '20200727',
 240         },
 241         'skip': 'This video is only available for registered users',
 242         'params': {
 243             'username': '<snip>',
 244             'password': '<snip>',
 245         },
 246         'expected_warnings': ['is not a supported codec'],
 247     }, {
 248         # Only available via new API endpoint
 249         'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
 250         'info_dict': {
 251             'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
 252             'ext': 'mp4',
 253             'title': 'Aflevering 5',
 254             'description': 'Wie valt door de mand tijdens een missie?',
 255             'duration': 2967.06,
 256             'season': 'Season 1',
 257             'season_number': 1,
 258             'episode_number': 5,
 259         },
 260         'skip': 'This video is only available for registered users',
 261         'params': {
 262             'username': '<snip>',
 263             'password': '<snip>',
 264         },
 265         'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
 266     }]
 267     _NETRC_MACHINE = 'vrtnu'
 268     _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
 269     _CONTEXT_ID = 'R3595707040'
 270
 271     def _real_initialize(self):
 272         self._login()
 273
 274     def _login(self):
 275         username, password = self._get_login_info()
 276         if username is None:
 277             return
 278
 279         auth_data = {
 280             'APIKey': self._APIKEY,
 281             'targetEnv': 'jssdk',
 282             'loginID': username,
 283             'password': password,
 284             'authMode': 'cookie',
 285         }
 286
 287         auth_info = self._gigya_login(auth_data)
 288
 289         # Sometimes authentication fails for no good reason, retry
 290         login_attempt = 1
 291         while login_attempt <= 3:
 292             try:
 293                 # When requesting a token, no actual token is returned, but the
 294                 # necessary cookies are set.
 295                 self._request_webpage(
 296                     'https://token.vrt.be',
 297                     None, note='Requesting a token', errnote='Could not get a token',
 298                     headers={
 299                         'Content-Type': 'application/json',
 300                         'Referer': 'https://www.vrt.be/vrtnu/',
 301                     },
 302                     data=json.dumps({
 303                         'uid': auth_info['UID'],
 304                         'uidsig': auth_info['UIDSignature'],
 305                         'ts': auth_info['signatureTimestamp'],
 306                         'email': auth_info['profile']['email'],
 307                     }).encode('utf-8'))
 308             except ExtractorError as e:
 309                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
 310                     login_attempt += 1
 311                     self.report_warning('Authentication failed')
 312                     self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
 313                 else:
 314                     raise e
 315             else:
 316                 break
 317
 318     def _real_extract(self, url):
 319         display_id = self._match_id(url)
 320
 321         webpage = self._download_webpage(url, display_id)
 322
 323         attrs = extract_attributes(self._search_regex(
 324             r'(<nui-media[^>]+>)', webpage, 'media element'))
 325         video_id = attrs['videoid']
 326         publication_id = attrs.get('publicationid')
 327         if publication_id:
 328             video_id = publication_id + '$' + video_id
 329
 330         page = (self._parse_json(self._search_regex(
 331             r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
 332             default='{}'), video_id, fatal=False) or {}).get('page') or {}
 333
 334         info = self._search_json_ld(webpage, display_id, default={})
 335         return merge_dicts(info, {
 336             '_type': 'url_transparent',
 337             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
 338             'ie_key': CanvasIE.ie_key(),
 339             'id': video_id,
 340             'display_id': display_id,
 341             'season_number': int_or_none(page.get('episode_season')),
 342         })
 343
 344
 345 class DagelijkseKostIE(InfoExtractor):
 346     IE_DESC = 'dagelijksekost.een.be'
 347     _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
 348     _TEST = {
 349         'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
 350         'md5': '30bfffc323009a3e5f689bef6efa2365',
 351         'info_dict': {
 352             'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
 353             'display_id': 'hachis-parmentier-met-witloof',
 354             'ext': 'mp4',
 355             'title': 'Hachis parmentier met witloof',
 356             'description': 'md5:9960478392d87f63567b5b117688cdc5',
 357             'thumbnail': r're:^https?://.*\.jpg$',
 358             'duration': 283.02,
 359         },
 360         'expected_warnings': ['is not a supported codec'],
 361     }
 362
 363     def _real_extract(self, url):
 364         display_id = self._match_id(url)
 365         webpage = self._download_webpage(url, display_id)
 366
 367         title = strip_or_none(get_element_by_class(
 368             'dish-metadata__title', webpage
 369         ) or self._html_search_meta(
 370             'twitter:title', webpage))
 371
 372         description = clean_html(get_element_by_class(
 373             'dish-description', webpage)
 374         ) or self._html_search_meta(
 375             ('description', 'twitter:description', 'og:description'),
 376             webpage)
 377
 378         video_id = self._html_search_regex(
 379             r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
 380             group='id')
 381
 382         return {
 383             '_type': 'url_transparent',
 384             'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
 385             'ie_key': CanvasIE.ie_key(),
 386             'id': video_id,
 387             'display_id': display_id,
 388             'title': title,
 389             'description': description,
 390         }