yt_dlp/extractor/cda.py

   1 import base64
   2 import codecs
   3 import datetime as dt
   4 import hashlib
   5 import hmac
   6 import json
   7 import random
   8 import re
   9 import urllib.parse
  10
  11 from .common import InfoExtractor
  12 from ..compat import compat_ord
  13 from ..utils import (
  14     ExtractorError,
  15     float_or_none,
  16     int_or_none,
  17     merge_dicts,
  18     multipart_encode,
  19     parse_duration,
  20     traverse_obj,
  21     try_call,
  22     try_get,
  23     urljoin,
  24 )
  25
  26
class CDAIE(InfoExtractor):
    """Extractor for videos on cda.pl (watch pages and ``ebd.cda.pl`` embeds).

    After a successful ``_perform_login`` the API headers carry an
    ``Authorization`` entry and ``_real_extract`` goes through the JSON API;
    otherwise the public web page is scraped.
    """
    # Accepts (www.)cda.pl/video/<id> and ebd.cda.pl/<W>x<H>/<id>;
    # <id> consists of digits and lowercase letters.
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    # NOTE(review): presumably the .netrc machine name the base class uses for credential lookup — confirm
    _NETRC_MACHINE = 'cdapl'

    # Root of the public website (watch pages, per-quality pages)
    _BASE_URL = 'https://www.cda.pl'
    # Root of the JSON API (token endpoint, video metadata)
    _BASE_API_URL = 'https://api.cda.pl'
    # Base headers for API requests; login supplements them with 'User-Agent'
    # and 'Authorization', and _real_extract treats the presence of
    # 'Authorization' as "logged in".
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    # Sent as the Authorization header of the /oauth/token request
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    # Cache section under which bearer tokens are stored, keyed by username
    _BEARER_CACHE = 'cda-bearer'

    # Sample URLs with the metadata expected for each; entries flagged
    # 'only_matching' carry no expected metadata.
    # NOTE(review): presumably consumed by the project's extractor test harness — confirm
    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        },
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        },
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli...  najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        },
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]
 112
 113     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
 114         data, content_type = multipart_encode({'age_confirm': ''})
 115         return self._download_webpage(
 116             url, video_id, *args,
 117             data=data, headers={
 118                 'Referer': url,
 119                 'Content-Type': content_type,
 120             }, **kwargs)
 121
 122     def _perform_login(self, username, password):
 123         app_version = random.choice((
 124             '1.2.88 build 15306',
 125             '1.2.174 build 18469',
 126         ))
 127         android_version = random.randrange(8, 14)
 128         phone_model = random.choice((
 129             # x-kom.pl top selling Android smartphones, as of 2022-12-26
 130             # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
 131             'ASUS ZenFone 8',
 132             'Motorola edge 20 5G',
 133             'Motorola edge 30 neo 5G',
 134             'Motorola moto g22',
 135             'OnePlus Nord 2T 5G',
 136             'Samsung Galaxy A32 SM‑A325F',
 137             'Samsung Galaxy M13',
 138             'Samsung Galaxy S20 FE 5G',
 139             'Xiaomi 11T',
 140             'Xiaomi POCO M4 Pro',
 141             'Xiaomi Redmi 10',
 142             'Xiaomi Redmi 10C',
 143             'Xiaomi Redmi 9C NFC',
 144             'Xiaomi Redmi Note 10 Pro',
 145             'Xiaomi Redmi Note 11 Pro',
 146             'Xiaomi Redmi Note 11',
 147             'Xiaomi Redmi Note 11S 5G',
 148             'Xiaomi Redmi Note 11S',
 149             'realme 10',
 150             'realme 9 Pro+',
 151             'vivo Y33s',
 152         ))
 153         self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
 154
 155         cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
 156         if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
 157             self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
 158             return
 159
 160         password_hash = base64.urlsafe_b64encode(hmac.new(
 161             b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
 162             ''.join(f'{bytes((bt & 255, )).hex():0>2}'
 163                     for bt in hashlib.md5(password.encode()).digest()).encode(),
 164             hashlib.sha256).digest()).decode().replace('=', '')
 165
 166         token_res = self._download_json(
 167             f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
 168             headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
 169             query={
 170                 'grant_type': 'password',
 171                 'login': username,
 172                 'password': password_hash,
 173             })
 174         self.cache.store(self._BEARER_CACHE, username, {
 175             'token': token_res['access_token'],
 176             'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
 177         })
 178         self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
 179
 180     def _real_extract(self, url):
 181         video_id = self._match_id(url)
 182
 183         if 'Authorization' in self._API_HEADERS:
 184             return self._api_extract(video_id)
 185         else:
 186             return self._web_extract(video_id)
 187
 188     def _api_extract(self, video_id):
 189         meta = self._download_json(
 190             f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
 191
 192         uploader = traverse_obj(meta, 'author', 'login')
 193
 194         formats = [{
 195             'url': quality['file'],
 196             'format': quality.get('title'),
 197             'resolution': quality.get('name'),
 198             'height': try_call(lambda: int(quality['name'][:-1])),
 199             'filesize': quality.get('length'),
 200         } for quality in meta['qualities'] if quality.get('file')]
 201
 202         if meta.get('premium') and not meta.get('premium_free') and not formats:
 203             raise ExtractorError(
 204                 'Video requires CDA Premium - subscription needed', expected=True)
 205
 206         return {
 207             'id': video_id,
 208             'title': meta.get('title'),
 209             'description': meta.get('description'),
 210             'uploader': None if uploader == 'anonim' else uploader,
 211             'average_rating': float_or_none(meta.get('rating')),
 212             'thumbnail': meta.get('thumb'),
 213             'formats': formats,
 214             'duration': meta.get('duration'),
 215             'age_limit': 18 if meta.get('for_adults') else 0,
 216             'view_count': meta.get('views'),
 217         }
 218
 219     def _web_extract(self, video_id):
 220         self._set_cookie('cda.pl', 'cda.player', 'html5')
 221         webpage, urlh = self._download_webpage_handle(
 222             f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 223
 224         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
 225             self.raise_login_required('This video is only available for premium users')
 226
 227         if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
 228             self.raise_geo_restricted()
 229
 230         need_confirm_age = False
 231         if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
 232                                    webpage, 'birthday validate form', default=None):
 233             webpage = self._download_age_confirm_page(
 234                 urlh.url, video_id, note='Confirming age')
 235             need_confirm_age = True
 236
 237         formats = []
 238
 239         uploader = self._search_regex(r'''(?x)
 240             <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
 241             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
 242             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
 243         ''', webpage, 'uploader', default=None, group='uploader')
 244         average_rating = self._search_regex(
 245             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
 246              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
 247             group='rating_value')
 248
 249         info_dict = {
 250             'id': video_id,
 251             'title': self._og_search_title(webpage),
 252             'description': self._og_search_description(webpage),
 253             'uploader': uploader,
 254             'average_rating': float_or_none(average_rating),
 255             'thumbnail': self._og_search_thumbnail(webpage),
 256             'formats': formats,
 257             'duration': None,
 258             'age_limit': 18 if need_confirm_age else 0,
 259         }
 260
 261         info = self._search_json_ld(webpage, video_id, default={})
 262
 263         # Source: https://www.cda.pl/js/player.js?t=1606154898
 264         def decrypt_file(a):
 265             for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
 266                 a = a.replace(p, '')
 267             a = urllib.parse.unquote(a)
 268             b = []
 269             for c in a:
 270                 f = compat_ord(c)
 271                 b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
 272             a = ''.join(b)
 273             a = a.replace('.cda.mp4', '')
 274             for p in ('.2cda.pl', '.3cda.pl'):
 275                 a = a.replace(p, '.cda.pl')
 276             if '/upstream' in a:
 277                 a = a.replace('/upstream', '.mp4/upstream')
 278                 return 'https://' + a
 279             return 'https://' + a + '.mp4'
 280
 281         def extract_format(page, version):
 282             json_str = self._html_search_regex(
 283                 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
 284                 f'{version} player_json', fatal=False, group='player_data')
 285             if not json_str:
 286                 return
 287             player_data = self._parse_json(
 288                 json_str, f'{version} player_data', fatal=False)
 289             if not player_data:
 290                 return
 291             video = player_data.get('video')
 292             if not video or 'file' not in video:
 293                 self.report_warning(f'Unable to extract {version} version information')
 294                 return
 295             if video['file'].startswith('uggc'):
 296                 video['file'] = codecs.decode(video['file'], 'rot_13')
 297                 if video['file'].endswith('adc.mp4'):
 298                     video['file'] = video['file'].replace('adc.mp4', '.mp4')
 299             elif not video['file'].startswith('http'):
 300                 video['file'] = decrypt_file(video['file'])
 301             video_quality = video.get('quality')
 302             qualities = video.get('qualities', {})
 303             video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
 304             info_dict['formats'].append({
 305                 'url': video['file'],
 306                 'format_id': video_quality,
 307                 'height': int_or_none(video_quality[:-1]),
 308             })
 309             for quality, cda_quality in qualities.items():
 310                 if quality == video_quality:
 311                     continue
 312                 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
 313                         'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
 314                 data = json.dumps(data).encode()
 315                 video_url = self._download_json(
 316                     f'https://www.cda.pl/video/{video_id}', video_id, headers={
 317                         'Content-Type': 'application/json',
 318                         'X-Requested-With': 'XMLHttpRequest',
 319                     }, data=data, note=f'Fetching {quality} url',
 320                     errnote=f'Failed to fetch {quality} url', fatal=False)
 321                 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
 322                     video_url = try_get(video_url, lambda x: x['result']['resp'])
 323                     info_dict['formats'].append({
 324                         'url': video_url,
 325                         'format_id': quality,
 326                         'height': int_or_none(quality[:-1]),
 327                     })
 328
 329             if not info_dict['duration']:
 330                 info_dict['duration'] = parse_duration(video.get('duration'))
 331
 332         extract_format(webpage, 'default')
 333
 334         for href, resolution in re.findall(
 335                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
 336                 webpage):
 337             if need_confirm_age:
 338                 handler = self._download_age_confirm_page
 339             else:
 340                 handler = self._download_webpage
 341
 342             webpage = handler(
 343                 urljoin(self._BASE_URL, href), video_id,
 344                 f'Downloading {resolution} version information', fatal=False)
 345             if not webpage:
 346                 # Manually report warning because empty page is returned when
 347                 # invalid version is requested.
 348                 self.report_warning(f'Unable to download {resolution} version information')
 349                 continue
 350
 351             extract_format(webpage, resolution)
 352
 353         return merge_dicts(info_dict, info)