]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/cda.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / cda.py
CommitLineData
34f00179 1import base64
fdeea726 2import codecs
c305a25c 3import datetime as dt
34f00179 4import hashlib
5import hmac
05664a2f 6import json
da8d2de2 7import random
ac668111 8import re
add96eb9 9import urllib.parse
8b0d7a66
KM
10
11from .common import InfoExtractor
add96eb9 12from ..compat import compat_ord
8b0d7a66 13from ..utils import (
8b0d7a66 14 ExtractorError,
577281b0
KM
15 float_or_none,
16 int_or_none,
38d70284 17 merge_dicts,
0c265486 18 multipart_encode,
577281b0 19 parse_duration,
34f00179 20 traverse_obj,
21 try_call,
05664a2f 22 try_get,
ac668111 23 urljoin,
8b0d7a66
KM
24)
25
26
class CDAIE(InfoExtractor):
    """Extractor for cda.pl video pages and ebd.cda.pl embeds.

    Two extraction paths exist: when `_perform_login` has stored an
    ``Authorization`` header the JSON API is queried, otherwise the public
    video web page is scraped.
    """
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    _BASE_URL = 'https://www.cda.pl'
    _BASE_API_URL = 'https://api.cda.pl'
    # Shared defaults only; `_perform_login` rebinds a per-instance copy before adding to it
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        },
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        },
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        },
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """Download `url` while submitting the age confirmation form.

        An empty ``age_confirm`` field is POSTed as multipart form data with
        `url` as the referer; extra positional/keyword arguments are passed
        through to `_download_webpage`.
        """
        data, content_type = multipart_encode({'age_confirm': ''})
        return self._download_webpage(
            url, video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    def _perform_login(self, username, password):
        """Obtain a bearer token (cached or fresh) and store it in the API headers.

        A randomised app-like User-Agent is set first. A cached token is reused
        while it stays valid for more than 5 more seconds; otherwise a new one
        is requested from the OAuth endpoint and written to the cache.
        """
        app_version = random.choice((
            '1.2.88 build 15306',
            '1.2.174 build 18469',
        ))
        android_version = random.randrange(8, 14)
        phone_model = random.choice((
            # x-kom.pl top selling Android smartphones, as of 2022-12-26
            # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
            'ASUS ZenFone 8',
            'Motorola edge 20 5G',
            'Motorola edge 30 neo 5G',
            'Motorola moto g22',
            'OnePlus Nord 2T 5G',
            'Samsung Galaxy A32 SM‑A325F',
            'Samsung Galaxy M13',
            'Samsung Galaxy S20 FE 5G',
            'Xiaomi 11T',
            'Xiaomi POCO M4 Pro',
            'Xiaomi Redmi 10',
            'Xiaomi Redmi 10C',
            'Xiaomi Redmi 9C NFC',
            'Xiaomi Redmi Note 10 Pro',
            'Xiaomi Redmi Note 11 Pro',
            'Xiaomi Redmi Note 11',
            'Xiaomi Redmi Note 11S 5G',
            'Xiaomi Redmi Note 11S',
            'realme 10',
            'realme 9 Pro+',
            'vivo Y33s',
        ))
        # Rebind a per-instance copy: writing into the class-level dict would leak
        # this login's User-Agent and token into every other instance of the class
        self._API_HEADERS = {
            **self._API_HEADERS,
            'User-Agent': f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})',
        }

        cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
        if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
            self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
            return

        # The HMAC-SHA256 message is the lowercase hex MD5 of the password;
        # base64 padding is stripped from the result
        password_md5_hex = hashlib.md5(password.encode()).hexdigest()
        password_hash = base64.urlsafe_b64encode(hmac.new(
            b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
            password_md5_hex.encode(),
            hashlib.sha256).digest()).decode().replace('=', '')

        token_res = self._download_json(
            f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
            headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
            query={
                'grant_type': 'password',
                'login': username,
                'password': password_hash,
            })
        self.cache.store(self._BEARER_CACHE, username, {
            'token': token_res['access_token'],
            'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
        })
        self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'

    def _real_extract(self, url):
        """Dispatch to API extraction when a bearer token is set, else scrape the web page."""
        video_id = self._match_id(url)

        if 'Authorization' in self._API_HEADERS:
            return self._api_extract(video_id)
        return self._web_extract(video_id)

    def _api_extract(self, video_id):
        """Build the info dict for `video_id` from the JSON API.

        Raises ExtractorError (expected) when the video is flagged premium,
        not premium-free, and no playable quality was returned.
        """
        meta = self._download_json(
            f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']

        # A single nested path; separate positional arguments would be tried as
        # alternative paths and yield the whole `author` object instead
        uploader = traverse_obj(meta, ('author', 'login'))

        formats = [{
            'url': quality['file'],
            'format': quality.get('title'),
            'resolution': quality.get('name'),
            'height': try_call(lambda: int(quality['name'][:-1])),
            'filesize': quality.get('length'),
        } for quality in meta.get('qualities') or [] if quality.get('file')]

        if meta.get('premium') and not meta.get('premium_free') and not formats:
            raise ExtractorError(
                'Video requires CDA Premium - subscription needed', expected=True)

        return {
            'id': video_id,
            'title': meta.get('title'),
            'description': meta.get('description'),
            'uploader': None if uploader == 'anonim' else uploader,
            'average_rating': float_or_none(meta.get('rating')),
            'thumbnail': meta.get('thumb'),
            'formats': formats,
            'duration': meta.get('duration'),
            'age_limit': 18 if meta.get('for_adults') else 0,
            'view_count': meta.get('views'),
        }

    def _web_extract(self, video_id):
        """Build the info dict for `video_id` by scraping the public video page.

        Confirms age when the page asks for it, reads the embedded
        ``player_data`` JSON for the default quality, then fetches the other
        qualities via JSON-RPC and via the per-resolution page links.
        """
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        webpage, urlh = self._download_webpage_handle(
            f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            self.raise_login_required('This video is only available for premium users')

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            webpage = self._download_age_confirm_page(
                urlh.url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        def quality_height(label):
            # Drops the last character of a label (e.g. the 'p' of '720p') and
            # parses the rest; a missing or non-string label yields None
            return int_or_none(label[:-1]) if isinstance(label, str) else None

        # Source: https://www.cda.pl/js/player.js?t=1606154898
        def decrypt_file(a):
            # Strip marker tokens, URL-decode, then rotate printable ASCII (33..126) by 47
            for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
                a = a.replace(p, '')
            a = urllib.parse.unquote(a)
            b = []
            for c in a:
                f = compat_ord(c)
                b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
            a = ''.join(b)
            a = a.replace('.cda.mp4', '')
            for p in ('.2cda.pl', '.3cda.pl'):
                a = a.replace(p, '.cda.pl')
            if '/upstream' in a:
                a = a.replace('/upstream', '.mp4/upstream')
                return 'https://' + a
            return 'https://' + a + '.mp4'

        def extract_format(page, version):
            # Appends the formats found in `page` to info_dict['formats'] and
            # fills info_dict['duration'] if it is still unset
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                f'{version} player_json', fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, f'{version} player_data', fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            if not video or not video.get('file'):
                self.report_warning(f'Unable to extract {version} version information')
                return

            file_url = video['file']
            if file_url.startswith('uggc'):
                # 'uggc' is 'http' in rot13
                file_url = codecs.decode(file_url, 'rot_13')
                if file_url.endswith('adc.mp4'):
                    file_url = file_url.replace('adc.mp4', '.mp4')
            elif not file_url.startswith('http'):
                file_url = decrypt_file(file_url)

            qualities = video.get('qualities')
            if not isinstance(qualities, dict):
                # Missing, null, or a non-mapping value (e.g. an empty JSON array)
                qualities = {}
            # Map the site's internal quality code back to its label (the dict key)
            video_quality = video.get('quality')
            video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
            info_dict['formats'].append({
                'url': file_url,
                'format_id': video_quality,
                'height': quality_height(video_quality),
            })
            for quality, cda_quality in qualities.items():
                if quality == video_quality:
                    continue
                data = json.dumps({
                    'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                    'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}],
                }).encode()
                response = self._download_json(
                    f'{self._BASE_URL}/video/{video_id}', video_id, headers={
                        'Content-Type': 'application/json',
                        'X-Requested-With': 'XMLHttpRequest',
                    }, data=data, note=f'Fetching {quality} url',
                    errnote=f'Failed to fetch {quality} url', fatal=False)
                if try_get(response, lambda x: x['result']['status']) != 'ok':
                    continue
                video_url = try_get(response, lambda x: x['result']['resp'])
                if not video_url:
                    # A format entry without a URL is unusable
                    continue
                info_dict['formats'].append({
                    'url': video_url,
                    'format_id': quality,
                    'height': quality_height(quality),
                })

            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        extract_format(webpage, 'default')

        # Age-gated videos need the confirmation POST on every page request
        handler = self._download_age_confirm_page if need_confirm_age else self._download_webpage
        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            version_page = handler(
                urljoin(self._BASE_URL, href), video_id,
                f'Downloading {resolution} version information', fatal=False)
            if not version_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning(f'Unable to download {resolution} version information')
                continue

            extract_format(version_page, resolution)

        return merge_dicts(info_dict, info)