]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/cda.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / cda.py
1 import base64
2 import codecs
3 import datetime as dt
4 import hashlib
5 import hmac
6 import json
7 import random
8 import re
9
10 from .common import InfoExtractor
11 from ..compat import compat_ord, compat_urllib_parse_unquote
12 from ..utils import (
13 ExtractorError,
14 float_or_none,
15 int_or_none,
16 merge_dicts,
17 multipart_encode,
18 parse_duration,
19 traverse_obj,
20 try_call,
21 try_get,
22 urljoin,
23 )
24
25
class CDAIE(InfoExtractor):
    """Extractor for cda.pl video pages and ebd.cda.pl embeds.

    If ``_perform_login`` has stored a bearer token in ``_API_HEADERS``,
    ``_real_extract`` reads metadata and formats from the JSON API
    (``_api_extract``); otherwise it scrapes the web page (``_web_extract``).
    """

    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    _BASE_URL = 'https://www.cda.pl'
    _BASE_API_URL = 'https://api.cda.pl'
    # NOTE(review): this dict lives on the class and is mutated in place by
    # _perform_login, so the User-Agent/Authorization it stores are visible to
    # everything that reads CDAIE._API_HEADERS. _real_extract relies on that.
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    _BEARER_CACHE = 'cda-bearer'

    # Values randomly combined into the User-Agent sent to the API
    _APP_VERSIONS = (
        '1.2.88 build 15306',
        '1.2.174 build 18469',
    )
    _PHONE_MODELS = (
        # x-kom.pl top selling Android smartphones, as of 2022-12-26
        # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
        'ASUS ZenFone 8',
        'Motorola edge 20 5G',
        'Motorola edge 30 neo 5G',
        'Motorola moto g22',
        'OnePlus Nord 2T 5G',
        'Samsung Galaxy A32 SM‑A325F',
        'Samsung Galaxy M13',
        'Samsung Galaxy S20 FE 5G',
        'Xiaomi 11T',
        'Xiaomi POCO M4 Pro',
        'Xiaomi Redmi 10',
        'Xiaomi Redmi 10C',
        'Xiaomi Redmi 9C NFC',
        'Xiaomi Redmi Note 10 Pro',
        'Xiaomi Redmi Note 11 Pro',
        'Xiaomi Redmi Note 11',
        'Xiaomi Redmi Note 11S 5G',
        'Xiaomi Redmi Note 11S',
        'realme 10',
        'realme 9 Pro+',
        'vivo Y33s',
    )

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        }
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        }
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        }
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        }
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    @staticmethod
    def _quality_height(quality):
        """Return the integer height encoded in a label such as ``'720p'``.

        The last character is dropped and the rest is passed to
        ``int_or_none``. Returns None when *quality* is not a string (e.g. the
        player data carried no quality at all) instead of raising on the slice.
        """
        if not isinstance(quality, str):
            return None
        return int_or_none(quality[:-1])

    @staticmethod
    def _decrypt_file(a):
        """Turn the obfuscated ``file`` value from the player data into a URL.

        Source: https://www.cda.pl/js/player.js?t=1606154898
        """
        # Strip the marker tokens inserted into the string
        for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
            a = a.replace(p, '')
        a = compat_urllib_parse_unquote(a)
        # Rotate characters with code points 33..126 within that range;
        # everything else is kept unchanged
        b = []
        for c in a:
            f = compat_ord(c)
            b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
        a = ''.join(b)
        a = a.replace('.cda.mp4', '')
        for p in ('.2cda.pl', '.3cda.pl'):
            a = a.replace(p, '.cda.pl')
        if '/upstream' in a:
            a = a.replace('/upstream', '.mp4/upstream')
            return 'https://' + a
        return 'https://' + a + '.mp4'

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """Download *url* while submitting the age-confirmation form.

        A multipart body with an empty ``age_confirm`` field is sent along with
        a ``Referer`` header equal to *url*. Extra positional and keyword
        arguments are forwarded to ``_download_webpage``, whose result is
        returned.
        """
        data, content_type = multipart_encode({'age_confirm': ''})
        return self._download_webpage(
            url, video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    def _perform_login(self, username, password):
        """Store an API bearer token for *username* in ``_API_HEADERS``.

        A randomized app User-Agent is set first. A cached token is reused
        when it remains valid for more than 5 more seconds; otherwise a new one is
        requested from ``/oauth/token`` and written to the cache together with
        its expiry time.
        """
        # Keep the original draw order (app, Android version, phone model)
        app_version = random.choice(self._APP_VERSIONS)
        android_version = random.randrange(8, 14)
        phone_model = random.choice(self._PHONE_MODELS)
        self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'

        cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
        cached_token = cached_bearer.get('token')
        # A cache entry lacking a token or an expiry is treated as stale
        # rather than raising KeyError/TypeError
        valid_until = cached_bearer.get('valid_until') or 0
        if cached_token and valid_until > dt.datetime.now().timestamp() + 5:
            self._API_HEADERS['Authorization'] = f'Bearer {cached_token}'
            return

        # The value sent as "password" is the unpadded URL-safe base64 of
        # HMAC-SHA256 over the lowercase hex MD5 digest of the password
        md5_hex = hashlib.md5(password.encode()).hexdigest()
        password_hash = base64.urlsafe_b64encode(hmac.new(
            b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
            md5_hex.encode(),
            hashlib.sha256).digest()).decode().replace('=', '')

        token_res = self._download_json(
            f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
            headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
            query={
                'grant_type': 'password',
                'login': username,
                'password': password_hash,
            })
        self.cache.store(self._BEARER_CACHE, username, {
            'token': token_res['access_token'],
            'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
        })
        self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'

    def _real_extract(self, url):
        """Extract *url* via the API when a token is set, else via the web page."""
        video_id = self._match_id(url)

        if 'Authorization' in self._API_HEADERS:
            return self._api_extract(video_id)
        return self._web_extract(video_id)

    def _api_extract(self, video_id):
        """Build the info dict for *video_id* from the ``/video/<id>`` API endpoint.

        Raises ExtractorError (expected) when the video is flagged ``premium``,
        is not ``premium_free`` and no quality entry carries a file URL.
        """
        meta = self._download_json(
            f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']

        uploader = traverse_obj(meta, 'author', 'login')

        # A missing/null "qualities" must not raise here, otherwise the
        # premium check below would never be reached
        formats = [{
            'url': quality['file'],
            'format': quality.get('title'),
            'resolution': quality.get('name'),
            'height': try_call(lambda: int(quality['name'][:-1])),
            'filesize': quality.get('length'),
        } for quality in meta.get('qualities') or [] if quality.get('file')]

        if meta.get('premium') and not meta.get('premium_free') and not formats:
            raise ExtractorError(
                'Video requires CDA Premium - subscription needed', expected=True)

        return {
            'id': video_id,
            'title': meta.get('title'),
            'description': meta.get('description'),
            'uploader': None if uploader == 'anonim' else uploader,
            'average_rating': float_or_none(meta.get('rating')),
            'thumbnail': meta.get('thumb'),
            'formats': formats,
            'duration': meta.get('duration'),
            'age_limit': 18 if meta.get('for_adults') else 0,
            'view_count': meta.get('views'),
        }

    def _web_extract(self, video_id):
        """Build the info dict for *video_id* by scraping the ``/vfilm`` web page.

        Confirms the age gate when the page shows one, collects a format from
        the default page and from every quality-button page, and merges the
        result with the page's JSON-LD data.
        """
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        webpage, urlh = self._download_webpage_handle(
            f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            self.raise_login_required('This video is only available for premium users')

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            webpage = self._download_age_confirm_page(
                urlh.url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        def extract_format(page, version):
            """Append the formats described by *page*'s player_data to info_dict."""
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                f'{version} player_json', fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, f'{version} player_data', fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            file_url = video.get('file') if isinstance(video, dict) else None
            # Also reject a present-but-empty/non-string "file": it cannot be
            # decoded and would otherwise crash or yield a bogus URL
            if not (isinstance(file_url, str) and file_url):
                self.report_warning(f'Unable to extract {version} version information')
                return
            if file_url.startswith('uggc'):
                # "uggc" is rot13 of "http"
                file_url = codecs.decode(file_url, 'rot_13')
                if file_url.endswith('adc.mp4'):
                    file_url = file_url.replace('adc.mp4', '.mp4')
            elif not file_url.startswith('http'):
                file_url = self._decrypt_file(file_url)

            qualities = video.get('qualities')
            if not isinstance(qualities, dict):
                # missing, null or list-valued "qualities": nothing to iterate
                qualities = {}
            # Map the page's own quality value back to its label (the dict key)
            video_quality = video.get('quality')
            video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
            info_dict['formats'].append({
                'url': file_url,
                'format_id': video_quality,
                'height': self._quality_height(video_quality),
            })

            # Ask the JSON-RPC endpoint for the links of the other qualities
            for quality, cda_quality in qualities.items():
                if quality == video_quality:
                    continue
                data = json.dumps({
                    'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                    'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}],
                }).encode('utf-8')
                response = self._download_json(
                    f'{self._BASE_URL}/video/{video_id}', video_id, headers={
                        'Content-Type': 'application/json',
                        'X-Requested-With': 'XMLHttpRequest'
                    }, data=data, note=f'Fetching {quality} url',
                    errnote=f'Failed to fetch {quality} url', fatal=False)
                if try_get(response, lambda x: x['result']['status']) != 'ok':
                    continue
                video_url = try_get(response, lambda x: x['result']['resp'])
                if not video_url:
                    # status "ok" without a link: do not emit a URL-less format
                    continue
                info_dict['formats'].append({
                    'url': video_url,
                    'format_id': quality,
                    'height': self._quality_height(quality),
                })

            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        extract_format(webpage, 'default')

        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            handler = self._download_age_confirm_page if need_confirm_age else self._download_webpage

            version_page = handler(
                urljoin(self._BASE_URL, href), video_id,
                f'Downloading {resolution} version information', fatal=False)
            if not version_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning(f'Unable to download {resolution} version information')
                continue

            extract_format(version_page, resolution)

        return merge_dicts(info_dict, info)