]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/cda.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / cda.py
1 import base64
2 import codecs
3 import datetime as dt
4 import hashlib
5 import hmac
6 import json
7 import random
8 import re
9 import urllib.parse
10
11 from .common import InfoExtractor
12 from ..compat import compat_ord
13 from ..utils import (
14 ExtractorError,
15 float_or_none,
16 int_or_none,
17 merge_dicts,
18 multipart_encode,
19 parse_duration,
20 traverse_obj,
21 try_call,
22 try_get,
23 urljoin,
24 )
25
26
class CDAIE(InfoExtractor):
    """Extractor for cda.pl video pages and ebd.cda.pl embeds.

    Two extraction paths exist. When `_perform_login` has stored an
    ``Authorization`` header in ``_API_HEADERS`` the JSON API at
    ``_BASE_API_URL`` is queried (`_api_extract`); otherwise the public
    web page is scraped (`_web_extract`).
    """

    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    _BASE_URL = 'https://www.cda.pl'
    _BASE_API_URL = 'https://api.cda.pl'
    # Class-level defaults only; `_perform_login` rebinds a per-instance copy
    # before adding the User-Agent / Authorization headers.
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        },
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
            'upload_date': '20160220',
            'timestamp': 1455968218,
        },
    }, {
        # Age-restricted with vfilm redirection
        'url': 'https://www.cda.pl/video/8753244c4',
        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
        'info_dict': {
            'id': '8753244c4',
            'ext': 'mp4',
            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
            'height': 1080,
            'uploader': 'arhn eu',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 991,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1633888264,
            'upload_date': '20211010',
        },
    }, {
        # Age-restricted without vfilm redirection
        'url': 'https://www.cda.pl/video/17028157b8',
        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
        'info_dict': {
            'id': '17028157b8',
            'ext': 'mp4',
            'title': 'STENDUPY MICHAŁ OGIŃSKI',
            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
            'height': 480,
            'uploader': 'oginski',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 18855,
            'age_limit': 18,
            'average_rating': float,
            'timestamp': 1699705901,
            'upload_date': '20231111',
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """Download `url` while submitting an empty ``age_confirm`` form field.

        The field is sent as multipart form data with `url` as the Referer.
        Extra positional/keyword arguments are forwarded unchanged to
        `_download_webpage`, whose return value is returned.
        """
        data, content_type = multipart_encode({'age_confirm': ''})
        return self._download_webpage(
            url, video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    @staticmethod
    def _hash_password(password):
        """Return the password digest sent to the ``/oauth/token`` endpoint.

        The hex MD5 of the password is signed with HMAC-SHA256 using the key
        below, URL-safe base64 encoded, and stripped of ``=`` characters.
        """
        md5_hex = hashlib.md5(password.encode()).hexdigest()
        signature = hmac.new(
            b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
            md5_hex.encode(), hashlib.sha256).digest()
        return base64.urlsafe_b64encode(signature).decode().replace('=', '')

    def _perform_login(self, username, password):
        """Obtain a bearer token for `username` and store it in ``_API_HEADERS``.

        A randomised app-like User-Agent is set first. A bearer token cached
        under ``_BEARER_CACHE`` is reused when it remains valid for more than
        five further seconds; otherwise a new token is requested from
        ``/oauth/token`` and written back to the cache.
        """
        app_version = random.choice((
            '1.2.88 build 15306',
            '1.2.174 build 18469',
        ))
        android_version = random.randrange(8, 14)
        phone_model = random.choice((
            # x-kom.pl top selling Android smartphones, as of 2022-12-26
            # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
            'ASUS ZenFone 8',
            'Motorola edge 20 5G',
            'Motorola edge 30 neo 5G',
            'Motorola moto g22',
            'OnePlus Nord 2T 5G',
            'Samsung Galaxy A32 SM‑A325F',
            'Samsung Galaxy M13',
            'Samsung Galaxy S20 FE 5G',
            'Xiaomi 11T',
            'Xiaomi POCO M4 Pro',
            'Xiaomi Redmi 10',
            'Xiaomi Redmi 10C',
            'Xiaomi Redmi 9C NFC',
            'Xiaomi Redmi Note 10 Pro',
            'Xiaomi Redmi Note 11 Pro',
            'Xiaomi Redmi Note 11',
            'Xiaomi Redmi Note 11S 5G',
            'Xiaomi Redmi Note 11S',
            'realme 10',
            'realme 9 Pro+',
            'vivo Y33s',
        ))
        # Rebind a per-instance copy instead of mutating the class-level dict,
        # so the User-Agent and bearer token of one instance are never shared
        # with other CDAIE instances living in the same process.
        self._API_HEADERS = {
            **self._API_HEADERS,
            'User-Agent': f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})',
        }

        cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
        if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
            self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
            return

        token_res = self._download_json(
            f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
            # The token request itself authenticates with the app's fixed credentials
            headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
            query={
                'grant_type': 'password',
                'login': username,
                'password': self._hash_password(password),
            })
        self.cache.store(self._BEARER_CACHE, username, {
            'token': token_res['access_token'],
            'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
        })
        self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'

    def _real_extract(self, url):
        """Extract `url` through the API when logged in, else through the web page."""
        video_id = self._match_id(url)

        if 'Authorization' in self._API_HEADERS:
            return self._api_extract(video_id)
        return self._web_extract(video_id)

    def _api_extract(self, video_id):
        """Build the info dict for `video_id` from the JSON API.

        Raises ExtractorError (expected) when the video is flagged premium,
        is not flagged ``premium_free``, and no playable quality was returned.
        """
        meta = self._download_json(
            f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']

        # NOTE(review): traverse_obj treats each positional argument as an
        # alternative path, so this reads meta['author'] and falls back to
        # meta['login'] - confirm that is intended rather than the nested
        # path ('author', 'login').
        uploader = traverse_obj(meta, 'author', 'login')

        # A missing/empty 'qualities' list must not raise here, otherwise the
        # premium check below could never report the real reason.
        formats = [{
            'url': quality['file'],
            'format': quality.get('title'),
            'resolution': quality.get('name'),
            'height': try_call(lambda: int(quality['name'][:-1])),
            'filesize': quality.get('length'),
        } for quality in meta.get('qualities') or [] if quality.get('file')]

        if meta.get('premium') and not meta.get('premium_free') and not formats:
            raise ExtractorError(
                'Video requires CDA Premium - subscription needed', expected=True)

        return {
            'id': video_id,
            'title': meta.get('title'),
            'description': meta.get('description'),
            'uploader': None if uploader == 'anonim' else uploader,
            'average_rating': float_or_none(meta.get('rating')),
            'thumbnail': meta.get('thumb'),
            'formats': formats,
            'duration': meta.get('duration'),
            'age_limit': 18 if meta.get('for_adults') else 0,
            'view_count': meta.get('views'),
        }

    # Source: https://www.cda.pl/js/player.js?t=1606154898
    @staticmethod
    def _decrypt_file(a):
        """Decode an obfuscated player ``file`` value into an https URL.

        Filler markers are stripped, the value is percent-decoded, every
        character with code 33..126 is rotated (equivalent to ROT47), and the
        host/extension pieces are normalised before ``https://`` is prepended.
        """
        for marker in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
            a = a.replace(marker, '')
        a = urllib.parse.unquote(a)
        a = ''.join(
            chr(33 + (code + 14) % 94) if 33 <= code <= 126 else chr(code)
            for code in map(compat_ord, a))
        a = a.replace('.cda.mp4', '')
        for host in ('.2cda.pl', '.3cda.pl'):
            a = a.replace(host, '.cda.pl')
        if '/upstream' in a:
            return 'https://' + a.replace('/upstream', '.mp4/upstream')
        return 'https://' + a + '.mp4'

    def _web_extract(self, video_id):
        """Build the info dict for `video_id` by scraping the web page.

        Raises a login-required error for premium-only videos and a
        geo-restriction error when the page says the video is unavailable in
        the viewer's country. If the page shows an age confirmation button,
        the page is re-downloaded with the confirmation form and the result
        is marked ``age_limit`` 18.
        """
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        webpage, urlh = self._download_webpage_handle(
            f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            self.raise_login_required('This video is only available for premium users')

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            webpage = self._download_age_confirm_page(
                urlh.url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        def height_from_label(label):
            # Labels such as '720p' carry the height before the trailing
            # letter; a missing or non-string label gives no height instead
            # of raising on the slice.
            return int_or_none(label[:-1]) if isinstance(label, str) else None

        def extract_format(page, version):
            """Append the formats advertised by `page`'s player data to info_dict."""
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                f'{version} player_json', fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, f'{version} player_data', fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            if not video or 'file' not in video:
                self.report_warning(f'Unable to extract {version} version information')
                return

            file_url = video['file']
            if file_url.startswith('uggc'):
                # 'uggc' is 'http' under ROT13
                file_url = codecs.decode(file_url, 'rot_13')
                if file_url.endswith('adc.mp4'):
                    file_url = file_url.replace('adc.mp4', '.mp4')
            elif not file_url.startswith('http'):
                file_url = self._decrypt_file(file_url)

            qualities = video.get('qualities', {})
            video_quality = video.get('quality')
            # Map the page's internal quality value back to its label, if known
            video_quality = next(
                (label for label, value in qualities.items() if value == video_quality),
                video_quality)
            info_dict['formats'].append({
                'url': file_url,
                'format_id': video_quality,
                'height': height_from_label(video_quality),
            })

            # Every other quality has to be requested through the JSON-RPC endpoint
            for quality, cda_quality in qualities.items():
                if quality == video_quality:
                    continue
                data = json.dumps({
                    'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                    'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}],
                }).encode()
                response = self._download_json(
                    f'https://www.cda.pl/video/{video_id}', video_id, headers={
                        'Content-Type': 'application/json',
                        'X-Requested-With': 'XMLHttpRequest',
                    }, data=data, note=f'Fetching {quality} url',
                    errnote=f'Failed to fetch {quality} url', fatal=False)
                if try_get(response, lambda x: x['result']['status']) == 'ok':
                    info_dict['formats'].append({
                        'url': try_get(response, lambda x: x['result']['resp']),
                        'format_id': quality,
                        'height': height_from_label(quality),
                    })

            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        extract_format(webpage, 'default')

        # The choice of downloader does not change between iterations
        if need_confirm_age:
            handler = self._download_age_confirm_page
        else:
            handler = self._download_webpage

        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            quality_page = handler(
                urljoin(self._BASE_URL, href), video_id,
                f'Downloading {resolution} version information', fatal=False)
            if not quality_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning(f'Unable to download {resolution} version information')
                continue

            extract_format(quality_page, resolution)

        return merge_dicts(info_dict, info)