]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/cda.py
[extractor/goplay] Fix extractor (#6654)
[yt-dlp.git] / yt_dlp / extractor / cda.py
CommitLineData
34f00179 1import base64
fdeea726 2import codecs
34f00179 3import datetime
4import hashlib
5import hmac
05664a2f 6import json
da8d2de2 7import random
ac668111 8import re
8b0d7a66
KM
9
10from .common import InfoExtractor
ac668111 11from ..compat import compat_ord, compat_urllib_parse_unquote
8b0d7a66 12from ..utils import (
8b0d7a66 13 ExtractorError,
577281b0
KM
14 float_or_none,
15 int_or_none,
38d70284 16 merge_dicts,
0c265486 17 multipart_encode,
577281b0 18 parse_duration,
0c265486 19 random_birthday,
34f00179 20 traverse_obj,
21 try_call,
05664a2f 22 try_get,
ac668111 23 urljoin,
8b0d7a66
KM
24)
25
26
class CDAIE(InfoExtractor):
    # Accepts regular video pages (cda.pl/video/<id>, with optional "www.")
    # and embed URLs (ebd.cda.pl/<digits>x<digits>/<id>). The id consists of
    # digits and lowercase letters.
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    # Base of the website; used for page downloads and for resolving relative links
    _BASE_URL = 'https://www.cda.pl'
    # Base of the JSON API used for login and for extraction when logged in
    _BASE_API_URL = 'https://api.cda.pl'
    # Headers sent with API requests. _perform_login stores 'User-Agent' and
    # 'Authorization' entries in this dict, and _real_extract checks it for
    # 'Authorization' to choose between API and web extraction.
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    # Cache section holding bearer tokens (entries are keyed by username)
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        }
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'view_count': int,
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
        }
    }, {
        # Age-restricted
        'url': 'http://www.cda.pl/video/1273454c4',
        'info_dict': {
            'id': '1273454c4',
            'ext': 'mp4',
            'title': 'Bronson (2008) napisy HD 1080p',
            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
            'height': 1080,
            'uploader': 'boniek61',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 5554,
            'age_limit': 18,
            'view_count': int,
            'average_rating': float,
        },
    }, {
        # Embed URL form; only checked against _VALID_URL
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]
8b0d7a66 91
0c265486
YCH
92 def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
93 form_data = random_birthday('rok', 'miesiac', 'dzien')
94 form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
95 data, content_type = multipart_encode(form_data)
96 return self._download_webpage(
97 urljoin(url, '/a/validatebirth'), video_id, *args,
98 data=data, headers={
99 'Referer': url,
100 'Content-Type': content_type,
101 }, **kwargs)
102
34f00179 103 def _perform_login(self, username, password):
da8d2de2 104 app_version = random.choice((
105 '1.2.88 build 15306',
106 '1.2.174 build 18469',
107 ))
108 android_version = random.randrange(8, 14)
109 phone_model = random.choice((
110 # x-kom.pl top selling Android smartphones, as of 2022-12-26
111 # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
112 'ASUS ZenFone 8',
113 'Motorola edge 20 5G',
114 'Motorola edge 30 neo 5G',
115 'Motorola moto g22',
116 'OnePlus Nord 2T 5G',
117 'Samsung Galaxy A32 SM‑A325F',
118 'Samsung Galaxy M13',
119 'Samsung Galaxy S20 FE 5G',
120 'Xiaomi 11T',
121 'Xiaomi POCO M4 Pro',
122 'Xiaomi Redmi 10',
123 'Xiaomi Redmi 10C',
124 'Xiaomi Redmi 9C NFC',
125 'Xiaomi Redmi Note 10 Pro',
126 'Xiaomi Redmi Note 11 Pro',
127 'Xiaomi Redmi Note 11',
128 'Xiaomi Redmi Note 11S 5G',
129 'Xiaomi Redmi Note 11S',
130 'realme 10',
131 'realme 9 Pro+',
132 'vivo Y33s',
133 ))
134 self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
135
34f00179 136 cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
137 if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
138 self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
139 return
140
141 password_hash = base64.urlsafe_b64encode(hmac.new(
142 b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
143 ''.join(f'{bytes((bt & 255, )).hex():0>2}'
144 for bt in hashlib.md5(password.encode()).digest()).encode(),
145 hashlib.sha256).digest()).decode().replace('=', '')
146
147 token_res = self._download_json(
148 f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
149 headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
150 query={
151 'grant_type': 'password',
152 'login': username,
153 'password': password_hash,
154 })
155 self.cache.store(self._BEARER_CACHE, username, {
156 'token': token_res['access_token'],
157 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(),
158 })
159 self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
160
8b0d7a66
KM
161 def _real_extract(self, url):
162 video_id = self._match_id(url)
34f00179 163
164 if 'Authorization' in self._API_HEADERS:
165 return self._api_extract(video_id)
166 else:
167 return self._web_extract(video_id, url)
168
169 def _api_extract(self, video_id):
170 meta = self._download_json(
171 f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
172
34f00179 173 uploader = traverse_obj(meta, 'author', 'login')
174
175 formats = [{
176 'url': quality['file'],
177 'format': quality.get('title'),
178 'resolution': quality.get('name'),
179 'height': try_call(lambda: int(quality['name'][:-1])),
180 'filesize': quality.get('length'),
181 } for quality in meta['qualities'] if quality.get('file')]
182
da8d2de2 183 if meta.get('premium') and not meta.get('premium_free') and not formats:
184 raise ExtractorError(
185 'Video requires CDA Premium - subscription needed', expected=True)
186
34f00179 187 return {
188 'id': video_id,
189 'title': meta.get('title'),
190 'description': meta.get('description'),
191 'uploader': None if uploader == 'anonim' else uploader,
192 'average_rating': float_or_none(meta.get('rating')),
193 'thumbnail': meta.get('thumb'),
194 'formats': formats,
195 'duration': meta.get('duration'),
196 'age_limit': 18 if meta.get('for_adults') else 0,
197 'view_count': meta.get('views'),
198 }
199
200 def _web_extract(self, video_id, url):
577281b0
KM
201 self._set_cookie('cda.pl', 'cda.player', 'html5')
202 webpage = self._download_webpage(
da8d2de2 203 f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
8b0d7a66
KM
204
205 if 'Ten film jest dostępny dla użytkowników premium' in webpage:
da8d2de2 206 self.raise_login_required('This video is only available for premium users')
8b0d7a66 207
cc2db878 208 if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
209 self.raise_geo_restricted()
210
0c265486 211 need_confirm_age = False
2181983a 212 if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
0c265486
YCH
213 webpage, 'birthday validate form', default=None):
214 webpage = self._download_age_confirm_page(
215 url, video_id, note='Confirming age')
216 need_confirm_age = True
217
8b0d7a66
KM
218 formats = []
219
577281b0
KM
220 uploader = self._search_regex(r'''(?x)
221 <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
222 (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
223 <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
224 ''', webpage, 'uploader', default=None, group='uploader')
225 view_count = self._search_regex(
226 r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
227 'view_count', default=None)
228 average_rating = self._search_regex(
38d70284 229 (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
230 r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
231 group='rating_value')
577281b0 232
f1ced6df
S
233 info_dict = {
234 'id': video_id,
577281b0
KM
235 'title': self._og_search_title(webpage),
236 'description': self._og_search_description(webpage),
237 'uploader': uploader,
238 'view_count': int_or_none(view_count),
239 'average_rating': float_or_none(average_rating),
240 'thumbnail': self._og_search_thumbnail(webpage),
f1ced6df
S
241 'formats': formats,
242 'duration': None,
0c265486 243 'age_limit': 18 if need_confirm_age else 0,
f1ced6df 244 }
8b0d7a66 245
41d1cca3 246 info = self._search_json_ld(webpage, video_id, default={})
247
38d70284 248 # Source: https://www.cda.pl/js/player.js?t=1606154898
249 def decrypt_file(a):
250 for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
251 a = a.replace(p, '')
252 a = compat_urllib_parse_unquote(a)
253 b = []
254 for c in a:
255 f = compat_ord(c)
ac668111 256 b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
38d70284 257 a = ''.join(b)
258 a = a.replace('.cda.mp4', '')
259 for p in ('.2cda.pl', '.3cda.pl'):
260 a = a.replace(p, '.cda.pl')
261 if '/upstream' in a:
262 a = a.replace('/upstream', '.mp4/upstream')
263 return 'https://' + a
264 return 'https://' + a + '.mp4'
265
f1ced6df 266 def extract_format(page, version):
f8f18f33 267 json_str = self._html_search_regex(
577281b0
KM
268 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
269 '%s player_json' % version, fatal=False, group='player_data')
270 if not json_str:
271 return
272 player_data = self._parse_json(
273 json_str, '%s player_data' % version, fatal=False)
274 if not player_data:
275 return
276 video = player_data.get('video')
277 if not video or 'file' not in video:
278 self.report_warning('Unable to extract %s version information' % version)
f1ced6df 279 return
fdeea726
AS
280 if video['file'].startswith('uggc'):
281 video['file'] = codecs.decode(video['file'], 'rot_13')
282 if video['file'].endswith('adc.mp4'):
283 video['file'] = video['file'].replace('adc.mp4', '.mp4')
38d70284 284 elif not video['file'].startswith('http'):
285 video['file'] = decrypt_file(video['file'])
05664a2f 286 video_quality = video.get('quality')
287 qualities = video.get('qualities', {})
288 video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
289 info_dict['formats'].append({
577281b0 290 'url': video['file'],
05664a2f 291 'format_id': video_quality,
292 'height': int_or_none(video_quality[:-1]),
293 })
294 for quality, cda_quality in qualities.items():
295 if quality == video_quality:
296 continue
297 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
298 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
299 data = json.dumps(data).encode('utf-8')
300 video_url = self._download_json(
301 f'https://www.cda.pl/video/{video_id}', video_id, headers={
302 'Content-Type': 'application/json',
303 'X-Requested-With': 'XMLHttpRequest'
304 }, data=data, note=f'Fetching {quality} url',
305 errnote=f'Failed to fetch {quality} url', fatal=False)
306 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
307 video_url = try_get(video_url, lambda x: x['result']['resp'])
308 info_dict['formats'].append({
309 'url': video_url,
310 'format_id': quality,
311 'height': int_or_none(quality[:-1])
312 })
313
f1ced6df 314 if not info_dict['duration']:
577281b0 315 info_dict['duration'] = parse_duration(video.get('duration'))
f1ced6df
S
316
317 extract_format(webpage, 'default')
318
319 for href, resolution in re.findall(
320 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
321 webpage):
0c265486
YCH
322 if need_confirm_age:
323 handler = self._download_age_confirm_page
324 else:
325 handler = self._download_webpage
326
327 webpage = handler(
41d1cca3 328 urljoin(self._BASE_URL, href), video_id,
577281b0 329 'Downloading %s version information' % resolution, fatal=False)
8b0d7a66 330 if not webpage:
f1ced6df
S
331 # Manually report warning because empty page is returned when
332 # invalid version is requested.
333 self.report_warning('Unable to download %s version information' % resolution)
8b0d7a66 334 continue
0c265486 335
f1ced6df 336 extract_format(webpage, resolution)
8b0d7a66 337
38d70284 338 return merge_dicts(info_dict, info)