]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/cda.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / cda.py
CommitLineData
34f00179 1import base64
fdeea726 2import codecs
c305a25c 3import datetime as dt
34f00179 4import hashlib
5import hmac
05664a2f 6import json
da8d2de2 7import random
ac668111 8import re
8b0d7a66
KM
9
10from .common import InfoExtractor
ac668111 11from ..compat import compat_ord, compat_urllib_parse_unquote
8b0d7a66 12from ..utils import (
8b0d7a66 13 ExtractorError,
577281b0
KM
14 float_or_none,
15 int_or_none,
38d70284 16 merge_dicts,
0c265486 17 multipart_encode,
577281b0 18 parse_duration,
34f00179 19 traverse_obj,
20 try_call,
05664a2f 21 try_get,
ac668111 22 urljoin,
8b0d7a66
KM
23)
24
25
26class CDAIE(InfoExtractor):
f1ced6df 27 _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
34f00179 28 _NETRC_MACHINE = 'cdapl'
29
da8d2de2 30 _BASE_URL = 'https://www.cda.pl'
34f00179 31 _BASE_API_URL = 'https://api.cda.pl'
32 _API_HEADERS = {
33 'Accept': 'application/vnd.cda.public+json',
34f00179 34 }
35 # hardcoded in the app
36 _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
37 _BEARER_CACHE = 'cda-bearer'
38
f1ced6df
S
39 _TESTS = [{
40 'url': 'http://www.cda.pl/video/5749950c',
41 'md5': '6f844bf51b15f31fae165365707ae970',
42 'info_dict': {
43 'id': '5749950c',
44 'ext': 'mp4',
45 'height': 720,
46 'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
577281b0 47 'description': 'md5:269ccd135d550da90d1662651fcb9772',
ec85ded8 48 'thumbnail': r're:^https?://.*\.jpg$',
577281b0 49 'average_rating': float,
0c265486
YCH
50 'duration': 39,
51 'age_limit': 0,
05664a2f 52 'upload_date': '20160221',
53 'timestamp': 1456078244,
f1ced6df
S
54 }
55 }, {
56 'url': 'http://www.cda.pl/video/57413289',
57 'md5': 'a88828770a8310fc00be6c95faf7f4d5',
58 'info_dict': {
59 'id': '57413289',
60 'ext': 'mp4',
61 'title': 'Lądowanie na lotnisku na Maderze',
577281b0 62 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
ec85ded8 63 'thumbnail': r're:^https?://.*\.jpg$',
577281b0 64 'uploader': 'crash404',
577281b0 65 'average_rating': float,
0c265486
YCH
66 'duration': 137,
67 'age_limit': 0,
6d8a53d8
P
68 'upload_date': '20160220',
69 'timestamp': 1455968218,
8b0d7a66 70 }
0c265486 71 }, {
6d8a53d8
P
72 # Age-restricted with vfilm redirection
73 'url': 'https://www.cda.pl/video/8753244c4',
74 'md5': 'd8eeb83d63611289507010d3df3bb8b3',
0c265486 75 'info_dict': {
6d8a53d8 76 'id': '8753244c4',
0c265486 77 'ext': 'mp4',
6d8a53d8
P
78 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
79 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
0c265486 80 'height': 1080,
6d8a53d8 81 'uploader': 'arhn eu',
0c265486 82 'thumbnail': r're:^https?://.*\.jpg$',
6d8a53d8 83 'duration': 991,
0c265486 84 'age_limit': 18,
0c265486 85 'average_rating': float,
6d8a53d8
P
86 'timestamp': 1633888264,
87 'upload_date': '20211010',
88 }
89 }, {
90 # Age-restricted without vfilm redirection
91 'url': 'https://www.cda.pl/video/17028157b8',
92 'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
93 'info_dict': {
94 'id': '17028157b8',
95 'ext': 'mp4',
96 'title': 'STENDUPY MICHAŁ OGIŃSKI',
97 'description': 'md5:5851f3272bfc31f762d616040a1d609a',
98 'height': 480,
99 'uploader': 'oginski',
100 'thumbnail': r're:^https?://.*\.jpg$',
101 'duration': 18855,
102 'age_limit': 18,
103 'average_rating': float,
104 'timestamp': 1699705901,
105 'upload_date': '20231111',
106 }
f1ced6df
S
107 }, {
108 'url': 'http://ebd.cda.pl/0x0/5749950c',
109 'only_matching': True,
110 }]
8b0d7a66 111
def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
    """Download ``url`` while submitting the age-confirmation form.

    A multipart-encoded form holding a single empty ``age_confirm`` field is
    sent as the request body, with ``url`` itself as the ``Referer``.

    Any extra positional and keyword arguments are forwarded unchanged to
    ``_download_webpage``, whose return value is returned as-is.
    """
    form_body, form_content_type = multipart_encode({'age_confirm': ''})
    confirm_headers = {
        'Referer': url,
        'Content-Type': form_content_type,
    }
    return self._download_webpage(
        url, video_id, *args, data=form_body, headers=confirm_headers, **kwargs)
120
def _perform_login(self, username, password):
    """Log in to the CDA API and install a bearer token in ``_API_HEADERS``.

    A randomised app-like ``User-Agent`` is written to ``_API_HEADERS`` first.
    If the cache holds a token for ``username`` that stays valid for more than
    five more seconds it is reused; otherwise a new token is requested from
    ``/oauth/token`` and cached together with its expiry timestamp.

    Note that ``_API_HEADERS`` is the dict defined on the class, so the
    headers set here are shared rather than per-instance.

    :param username: account login, also used as the cache key
    :param password: plain-text password; only a derived hash is transmitted
    :raises KeyError: if the token response lacks ``access_token`` or
        ``expires_in``, or a cached entry lacks ``token``
    """
    app_version = random.choice((
        '1.2.88 build 15306',
        '1.2.174 build 18469',
    ))
    android_version = random.randrange(8, 14)
    phone_model = random.choice((
        # x-kom.pl top selling Android smartphones, as of 2022-12-26
        # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
        'ASUS ZenFone 8',
        'Motorola edge 20 5G',
        'Motorola edge 30 neo 5G',
        'Motorola moto g22',
        'OnePlus Nord 2T 5G',
        # Plain ASCII hyphen on purpose: the value ends up in an HTTP header,
        # and a non-breaking hyphen (U+2011) cannot be latin-1 encoded
        'Samsung Galaxy A32 SM-A325F',
        'Samsung Galaxy M13',
        'Samsung Galaxy S20 FE 5G',
        'Xiaomi 11T',
        'Xiaomi POCO M4 Pro',
        'Xiaomi Redmi 10',
        'Xiaomi Redmi 10C',
        'Xiaomi Redmi 9C NFC',
        'Xiaomi Redmi Note 10 Pro',
        'Xiaomi Redmi Note 11 Pro',
        'Xiaomi Redmi Note 11',
        'Xiaomi Redmi Note 11S 5G',
        'Xiaomi Redmi Note 11S',
        'realme 10',
        'realme 9 Pro+',
        'vivo Y33s',
    ))
    self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'

    cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
    # Require a few seconds of remaining validity so the token does not
    # expire between this check and the request that uses it
    if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
        self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
        return

    # The password is sent as unpadded urlsafe-base64 of
    # HMAC-SHA256(key, lowercase hex MD5 of the password)
    password_md5_hex = hashlib.md5(password.encode()).hexdigest()
    password_hash = base64.urlsafe_b64encode(hmac.new(
        b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
        password_md5_hex.encode(),
        hashlib.sha256).digest()).decode().replace('=', '')

    token_res = self._download_json(
        f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
        headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
        query={
            'grant_type': 'password',
            'login': username,
            'password': password_hash,
        })
    self.cache.store(self._BEARER_CACHE, username, {
        'token': token_res['access_token'],
        # Absolute expiry time, measured from when the response arrived
        'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
    })
    self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
178
8b0d7a66
KM
def _real_extract(self, url):
    """Extract the video at ``url`` via the API when logged in, else via the website.

    The API path is chosen whenever ``_API_HEADERS`` carries an
    ``Authorization`` entry; otherwise the web page is scraped.
    """
    video_id = self._match_id(url)

    has_bearer = 'Authorization' in self._API_HEADERS
    extract = self._api_extract if has_bearer else self._web_extract
    return extract(video_id)
34f00179 186
def _api_extract(self, video_id):
    """Build the info dict for ``video_id`` from the JSON API.

    Fetches ``/video/<id>`` with ``_API_HEADERS`` and turns every entry of
    ``qualities`` that has a ``file`` into a format.

    :raises ExtractorError: (expected) when the video is marked premium, is
        not ``premium_free`` and no quality exposes a file
    :raises KeyError: if the response lacks ``video`` or ``qualities``
    """
    meta = self._download_json(
        f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']

    # One tuple is one nested path (meta['author']['login']). Passing the keys
    # as separate arguments would make them alternative top-level paths and
    # return the whole author object instead of the login.
    uploader = traverse_obj(meta, ('author', 'login'))

    formats = [{
        'url': quality['file'],
        'format': quality.get('title'),
        'resolution': quality.get('name'),
        # Drop the last character of the name and parse the rest as the
        # height; None when the name is missing or not numeric
        'height': try_call(lambda: int(quality['name'][:-1])),
        'filesize': quality.get('length'),
    } for quality in meta['qualities'] if quality.get('file')]

    if meta.get('premium') and not meta.get('premium_free') and not formats:
        raise ExtractorError(
            'Video requires CDA Premium - subscription needed', expected=True)

    return {
        'id': video_id,
        'title': meta.get('title'),
        'description': meta.get('description'),
        # 'anonim' is reported as no uploader
        'uploader': None if uploader == 'anonim' else uploader,
        'average_rating': float_or_none(meta.get('rating')),
        'thumbnail': meta.get('thumb'),
        'formats': formats,
        'duration': meta.get('duration'),
        'age_limit': 18 if meta.get('for_adults') else 0,
        'view_count': meta.get('views'),
    }
217
def _web_extract(self, video_id):
    """Build the info dict for ``video_id`` by scraping the website.

    Steps:
      1. download ``/video/<id>/vfilm``; stop on the premium-only or
         geo-restriction notices;
      2. if an age-confirmation button is present, re-request the page with
         the confirmation form and mark the video as 18+;
      3. read uploader, rating and OpenGraph metadata from the page;
      4. take one format from the page's ``player_data`` and ask the
         ``videoGetLink`` JSON-RPC method for every other listed quality;
      5. repeat the ``player_data`` extraction for each quality-button page.

    The result is merged with whatever JSON-LD metadata the page provides;
    the scraped values are passed to ``merge_dicts`` first.
    """
    # NOTE(review): presumably selects the HTML5 player so that player_data
    # is present in the page - confirm
    self._set_cookie('cda.pl', 'cda.player', 'html5')
    webpage, urlh = self._download_webpage_handle(
        f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)

    if 'Ten film jest dostępny dla użytkowników premium' in webpage:
        self.raise_login_required('This video is only available for premium users')

    if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
        self.raise_geo_restricted()

    need_confirm_age = False
    if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                               webpage, 'birthday validate form', default=None):
        # Use the URL actually served (after redirects) for the confirmation
        webpage = self._download_age_confirm_page(
            urlh.url, video_id, note='Confirming age')
        need_confirm_age = True

    formats = []

    uploader = self._search_regex(r'''(?x)
        <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
        (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
        <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
    ''', webpage, 'uploader', default=None, group='uploader')
    average_rating = self._search_regex(
        (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
         r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
        group='rating_value')

    info_dict = {
        'id': video_id,
        'title': self._og_search_title(webpage),
        'description': self._og_search_description(webpage),
        'uploader': uploader,
        'average_rating': float_or_none(average_rating),
        'thumbnail': self._og_search_thumbnail(webpage),
        'formats': formats,
        # Filled in by extract_format() from the first player data that has it
        'duration': None,
        'age_limit': 18 if need_confirm_age else 0,
    }

    info = self._search_json_ld(webpage, video_id, default={})

    # Source: https://www.cda.pl/js/player.js?t=1606154898
    def decrypt_file(a):
        """Turn an obfuscated ``file`` value into an https URL."""
        # Strip the decoy markers before decoding
        for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
            a = a.replace(p, '')
        a = compat_urllib_parse_unquote(a)
        # Rotate printable ASCII (33..126) by 47 positions (ROT47);
        # every other character is kept as-is
        b = []
        for c in a:
            f = compat_ord(c)
            b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
        a = ''.join(b)
        a = a.replace('.cda.mp4', '')
        for p in ('.2cda.pl', '.3cda.pl'):
            a = a.replace(p, '.cda.pl')
        if '/upstream' in a:
            a = a.replace('/upstream', '.mp4/upstream')
            return 'https://' + a
        return 'https://' + a + '.mp4'

    def extract_format(page, version):
        """Append the formats found in ``page``'s player data to ``info_dict``."""
        json_str = self._html_search_regex(
            r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
            f'{version} player_json', fatal=False, group='player_data')
        if not json_str:
            return
        player_data = self._parse_json(
            json_str, f'{version} player_data', fatal=False)
        if not player_data:
            return
        video = player_data.get('video')
        if not video or 'file' not in video:
            self.report_warning(f'Unable to extract {version} version information')
            return
        # 'uggc' is 'http' in ROT13
        if video['file'].startswith('uggc'):
            video['file'] = codecs.decode(video['file'], 'rot_13')
            if video['file'].endswith('adc.mp4'):
                video['file'] = video['file'].replace('adc.mp4', '.mp4')
        elif not video['file'].startswith('http'):
            video['file'] = decrypt_file(video['file'])
        video_quality = video.get('quality')
        # Tolerate an explicit null as well as a missing key
        qualities = video.get('qualities') or {}
        # Map the internal quality value back to its label (a key of
        # 'qualities'); keep the raw value when nothing matches
        video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
        info_dict['formats'].append({
            'url': video['file'],
            'format_id': video_quality,
            # The label may be absent or not a string; a usable file URL
            # must not be lost just because the height cannot be parsed
            'height': try_call(lambda: int_or_none(video_quality[:-1])),
        })
        for quality, cda_quality in qualities.items():
            if quality == video_quality:
                continue
            data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                    'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
            data = json.dumps(data).encode('utf-8')
            link_res = self._download_json(
                f'{self._BASE_URL}/video/{video_id}', video_id, headers={
                    'Content-Type': 'application/json',
                    'X-Requested-With': 'XMLHttpRequest'
                }, data=data, note=f'Fetching {quality} url',
                errnote=f'Failed to fetch {quality} url', fatal=False)
            if try_get(link_res, lambda x: x['result']['status']) != 'ok':
                continue
            video_url = try_get(link_res, lambda x: x['result']['resp'])
            if not video_url:
                # An 'ok' reply without a link would yield a format with no URL
                continue
            info_dict['formats'].append({
                'url': video_url,
                'format_id': quality,
                'height': int_or_none(quality[:-1])
            })

        if not info_dict['duration']:
            info_dict['duration'] = parse_duration(video.get('duration'))

    extract_format(webpage, 'default')

    # Age-restricted videos need the confirmation form on every page request
    if need_confirm_age:
        handler = self._download_age_confirm_page
    else:
        handler = self._download_webpage

    for href, resolution in re.findall(
            r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
            webpage):
        version_page = handler(
            urljoin(self._BASE_URL, href), video_id,
            f'Downloading {resolution} version information', fatal=False)
        if not version_page:
            # Manually report warning because empty page is returned when
            # invalid version is requested.
            self.report_warning(f'Unable to download {resolution} version information')
            continue

        extract_format(version_page, resolution)

    return merge_dicts(info_dict, info)
299 video['file'] = decrypt_file(video['file'])
05664a2f 300 video_quality = video.get('quality')
301 qualities = video.get('qualities', {})
302 video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
303 info_dict['formats'].append({
577281b0 304 'url': video['file'],
05664a2f 305 'format_id': video_quality,
306 'height': int_or_none(video_quality[:-1]),
307 })
308 for quality, cda_quality in qualities.items():
309 if quality == video_quality:
310 continue
311 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
312 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
313 data = json.dumps(data).encode('utf-8')
314 video_url = self._download_json(
315 f'https://www.cda.pl/video/{video_id}', video_id, headers={
316 'Content-Type': 'application/json',
317 'X-Requested-With': 'XMLHttpRequest'
318 }, data=data, note=f'Fetching {quality} url',
319 errnote=f'Failed to fetch {quality} url', fatal=False)
320 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
321 video_url = try_get(video_url, lambda x: x['result']['resp'])
322 info_dict['formats'].append({
323 'url': video_url,
324 'format_id': quality,
325 'height': int_or_none(quality[:-1])
326 })
327
f1ced6df 328 if not info_dict['duration']:
577281b0 329 info_dict['duration'] = parse_duration(video.get('duration'))
f1ced6df
S
330
331 extract_format(webpage, 'default')
332
333 for href, resolution in re.findall(
334 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
335 webpage):
0c265486
YCH
336 if need_confirm_age:
337 handler = self._download_age_confirm_page
338 else:
339 handler = self._download_webpage
340
341 webpage = handler(
41d1cca3 342 urljoin(self._BASE_URL, href), video_id,
577281b0 343 'Downloading %s version information' % resolution, fatal=False)
8b0d7a66 344 if not webpage:
f1ced6df
S
345 # Manually report warning because empty page is returned when
346 # invalid version is requested.
347 self.report_warning('Unable to download %s version information' % resolution)
8b0d7a66 348 continue
0c265486 349
f1ced6df 350 extract_format(webpage, resolution)
8b0d7a66 351
38d70284 352 return merge_dicts(info_dict, info)