yt_dlp/extractor/cda.py

   1 import base64
   2 import codecs
   3 import datetime as dt
   4 import hashlib
   5 import hmac
   6 import json
   7 import random
   8 import re
   9
  10 from .common import InfoExtractor
  11 from ..compat import compat_ord, compat_urllib_parse_unquote
  12 from ..utils import (
  13     ExtractorError,
  14     float_or_none,
  15     int_or_none,
  16     merge_dicts,
  17     multipart_encode,
  18     parse_duration,
  19     traverse_obj,
  20     try_call,
  21     try_get,
  22     urljoin,
  23 )
  24
  25
class CDAIE(InfoExtractor):
    """Extractor for videos hosted on cda.pl.

    Handles regular video pages (`cda.pl/video/<id>`) and embeds
    (`ebd.cda.pl/<width>x<height>/<id>`). Extraction goes through the API when
    an `Authorization` header is available in `_API_HEADERS`, and through the
    website otherwise (see `_real_extract`).
    """
    # The `id` group is the lowercase alphanumeric video identifier
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    # Website root, used for video pages and for resolving relative quality links
    _BASE_URL = 'https://www.cda.pl'
    # API root, used for the OAuth token request and for video metadata
    _BASE_API_URL = 'https://api.cda.pl'
    # Base headers for API requests; `_perform_login` extends them with a
    # `User-Agent` and, once a token is available, an `Authorization` header
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    # Sent as the `Authorization` header of the token request itself
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    # Cache section under which bearer tokens are stored, keyed by username
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        }
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        }
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli...  najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        }
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        }
    }, {
        # Embed URL; only checked against _VALID_URL
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]
 111
 112     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
 113         data, content_type = multipart_encode({'age_confirm': ''})
 114         return self._download_webpage(
 115             url, video_id, *args,
 116             data=data, headers={
 117                 'Referer': url,
 118                 'Content-Type': content_type,
 119             }, **kwargs)
 120
 121     def _perform_login(self, username, password):
 122         app_version = random.choice((
 123             '1.2.88 build 15306',
 124             '1.2.174 build 18469',
 125         ))
 126         android_version = random.randrange(8, 14)
 127         phone_model = random.choice((
 128             # x-kom.pl top selling Android smartphones, as of 2022-12-26
 129             # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
 130             'ASUS ZenFone 8',
 131             'Motorola edge 20 5G',
 132             'Motorola edge 30 neo 5G',
 133             'Motorola moto g22',
 134             'OnePlus Nord 2T 5G',
 135             'Samsung Galaxy A32 SM‑A325F',
 136             'Samsung Galaxy M13',
 137             'Samsung Galaxy S20 FE 5G',
 138             'Xiaomi 11T',
 139             'Xiaomi POCO M4 Pro',
 140             'Xiaomi Redmi 10',
 141             'Xiaomi Redmi 10C',
 142             'Xiaomi Redmi 9C NFC',
 143             'Xiaomi Redmi Note 10 Pro',
 144             'Xiaomi Redmi Note 11 Pro',
 145             'Xiaomi Redmi Note 11',
 146             'Xiaomi Redmi Note 11S 5G',
 147             'Xiaomi Redmi Note 11S',
 148             'realme 10',
 149             'realme 9 Pro+',
 150             'vivo Y33s',
 151         ))
 152         self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
 153
 154         cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
 155         if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
 156             self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
 157             return
 158
 159         password_hash = base64.urlsafe_b64encode(hmac.new(
 160             b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
 161             ''.join(f'{bytes((bt & 255, )).hex():0>2}'
 162                     for bt in hashlib.md5(password.encode()).digest()).encode(),
 163             hashlib.sha256).digest()).decode().replace('=', '')
 164
 165         token_res = self._download_json(
 166             f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
 167             headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
 168             query={
 169                 'grant_type': 'password',
 170                 'login': username,
 171                 'password': password_hash,
 172             })
 173         self.cache.store(self._BEARER_CACHE, username, {
 174             'token': token_res['access_token'],
 175             'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
 176         })
 177         self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
 178
 179     def _real_extract(self, url):
 180         video_id = self._match_id(url)
 181
 182         if 'Authorization' in self._API_HEADERS:
 183             return self._api_extract(video_id)
 184         else:
 185             return self._web_extract(video_id)
 186
 187     def _api_extract(self, video_id):
 188         meta = self._download_json(
 189             f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
 190
 191         uploader = traverse_obj(meta, 'author', 'login')
 192
 193         formats = [{
 194             'url': quality['file'],
 195             'format': quality.get('title'),
 196             'resolution': quality.get('name'),
 197             'height': try_call(lambda: int(quality['name'][:-1])),
 198             'filesize': quality.get('length'),
 199         } for quality in meta['qualities'] if quality.get('file')]
 200
 201         if meta.get('premium') and not meta.get('premium_free') and not formats:
 202             raise ExtractorError(
 203                 'Video requires CDA Premium - subscription needed', expected=True)
 204
 205         return {
 206             'id': video_id,
 207             'title': meta.get('title'),
 208             'description': meta.get('description'),
 209             'uploader': None if uploader == 'anonim' else uploader,
 210             'average_rating': float_or_none(meta.get('rating')),
 211             'thumbnail': meta.get('thumb'),
 212             'formats': formats,
 213             'duration': meta.get('duration'),
 214             'age_limit': 18 if meta.get('for_adults') else 0,
 215             'view_count': meta.get('views'),
 216         }
 217
 218     def _web_extract(self, video_id):
 219         self._set_cookie('cda.pl', 'cda.player', 'html5')
 220         webpage, urlh = self._download_webpage_handle(
 221             f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 222
 223         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
 224             self.raise_login_required('This video is only available for premium users')
 225
 226         if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
 227             self.raise_geo_restricted()
 228
 229         need_confirm_age = False
 230         if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
 231                                    webpage, 'birthday validate form', default=None):
 232             webpage = self._download_age_confirm_page(
 233                 urlh.url, video_id, note='Confirming age')
 234             need_confirm_age = True
 235
 236         formats = []
 237
 238         uploader = self._search_regex(r'''(?x)
 239             <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
 240             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
 241             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
 242         ''', webpage, 'uploader', default=None, group='uploader')
 243         average_rating = self._search_regex(
 244             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
 245              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
 246             group='rating_value')
 247
 248         info_dict = {
 249             'id': video_id,
 250             'title': self._og_search_title(webpage),
 251             'description': self._og_search_description(webpage),
 252             'uploader': uploader,
 253             'average_rating': float_or_none(average_rating),
 254             'thumbnail': self._og_search_thumbnail(webpage),
 255             'formats': formats,
 256             'duration': None,
 257             'age_limit': 18 if need_confirm_age else 0,
 258         }
 259
 260         info = self._search_json_ld(webpage, video_id, default={})
 261
 262         # Source: https://www.cda.pl/js/player.js?t=1606154898
 263         def decrypt_file(a):
 264             for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
 265                 a = a.replace(p, '')
 266             a = compat_urllib_parse_unquote(a)
 267             b = []
 268             for c in a:
 269                 f = compat_ord(c)
 270                 b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
 271             a = ''.join(b)
 272             a = a.replace('.cda.mp4', '')
 273             for p in ('.2cda.pl', '.3cda.pl'):
 274                 a = a.replace(p, '.cda.pl')
 275             if '/upstream' in a:
 276                 a = a.replace('/upstream', '.mp4/upstream')
 277                 return 'https://' + a
 278             return 'https://' + a + '.mp4'
 279
 280         def extract_format(page, version):
 281             json_str = self._html_search_regex(
 282                 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
 283                 '%s player_json' % version, fatal=False, group='player_data')
 284             if not json_str:
 285                 return
 286             player_data = self._parse_json(
 287                 json_str, '%s player_data' % version, fatal=False)
 288             if not player_data:
 289                 return
 290             video = player_data.get('video')
 291             if not video or 'file' not in video:
 292                 self.report_warning('Unable to extract %s version information' % version)
 293                 return
 294             if video['file'].startswith('uggc'):
 295                 video['file'] = codecs.decode(video['file'], 'rot_13')
 296                 if video['file'].endswith('adc.mp4'):
 297                     video['file'] = video['file'].replace('adc.mp4', '.mp4')
 298             elif not video['file'].startswith('http'):
 299                 video['file'] = decrypt_file(video['file'])
 300             video_quality = video.get('quality')
 301             qualities = video.get('qualities', {})
 302             video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
 303             info_dict['formats'].append({
 304                 'url': video['file'],
 305                 'format_id': video_quality,
 306                 'height': int_or_none(video_quality[:-1]),
 307             })
 308             for quality, cda_quality in qualities.items():
 309                 if quality == video_quality:
 310                     continue
 311                 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
 312                         'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
 313                 data = json.dumps(data).encode('utf-8')
 314                 video_url = self._download_json(
 315                     f'https://www.cda.pl/video/{video_id}', video_id, headers={
 316                         'Content-Type': 'application/json',
 317                         'X-Requested-With': 'XMLHttpRequest'
 318                     }, data=data, note=f'Fetching {quality} url',
 319                     errnote=f'Failed to fetch {quality} url', fatal=False)
 320                 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
 321                     video_url = try_get(video_url, lambda x: x['result']['resp'])
 322                     info_dict['formats'].append({
 323                         'url': video_url,
 324                         'format_id': quality,
 325                         'height': int_or_none(quality[:-1])
 326                     })
 327
 328             if not info_dict['duration']:
 329                 info_dict['duration'] = parse_duration(video.get('duration'))
 330
 331         extract_format(webpage, 'default')
 332
 333         for href, resolution in re.findall(
 334                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
 335                 webpage):
 336             if need_confirm_age:
 337                 handler = self._download_age_confirm_page
 338             else:
 339                 handler = self._download_webpage
 340
 341             webpage = handler(
 342                 urljoin(self._BASE_URL, href), video_id,
 343                 'Downloading %s version information' % resolution, fatal=False)
 344             if not webpage:
 345                 # Manually report warning because empty page is returned when
 346                 # invalid version is requested.
 347                 self.report_warning('Unable to download %s version information' % resolution)
 348                 continue
 349
 350             extract_format(webpage, resolution)
 351
 352         return merge_dicts(info_dict, info)