]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/cda.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / cda.py
CommitLineData
34f00179 1import base64
fdeea726 2import codecs
34f00179 3import datetime
4import hashlib
5import hmac
05664a2f 6import json
ac668111 7import re
8b0d7a66
KM
8
9from .common import InfoExtractor
ac668111 10from ..compat import compat_ord, compat_urllib_parse_unquote
8b0d7a66 11from ..utils import (
8b0d7a66 12 ExtractorError,
577281b0
KM
13 float_or_none,
14 int_or_none,
38d70284 15 merge_dicts,
0c265486 16 multipart_encode,
577281b0 17 parse_duration,
0c265486 18 random_birthday,
34f00179 19 traverse_obj,
20 try_call,
05664a2f 21 try_get,
ac668111 22 urljoin,
8b0d7a66
KM
23)
24
25
class CDAIE(InfoExtractor):
    # Accepts regular watch pages (cda.pl/video/<id>) and embed URLs
    # (ebd.cda.pl/<W>x<H>/<id>); the video id is captured as "id".
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _NETRC_MACHINE = 'cdapl'

    # Website root used for page scraping, and root of the JSON API
    _BASE_URL = 'http://www.cda.pl/'
    _BASE_API_URL = 'https://api.cda.pl'
    # Headers sent with every API request; _perform_login adds an
    # "Authorization" entry to this dict once a bearer token is available.
    _API_HEADERS = {
        'Accept': 'application/vnd.cda.public+json',
        'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)',
    }
    # hardcoded in the app
    _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
    # Cache section under which bearer tokens are stored, keyed by username
    _BEARER_CACHE = 'cda-bearer'

    _TESTS = [
        {
            'url': 'http://www.cda.pl/video/5749950c',
            'md5': '6f844bf51b15f31fae165365707ae970',
            'info_dict': {
                'id': '5749950c',
                'ext': 'mp4',
                'height': 720,
                'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
                'description': 'md5:269ccd135d550da90d1662651fcb9772',
                'thumbnail': r're:^https?://.*\.jpg$',
                'average_rating': float,
                'duration': 39,
                'age_limit': 0,
                'upload_date': '20160221',
                'timestamp': 1456078244,
            },
        },
        {
            'url': 'http://www.cda.pl/video/57413289',
            'md5': 'a88828770a8310fc00be6c95faf7f4d5',
            'info_dict': {
                'id': '57413289',
                'ext': 'mp4',
                'title': 'Lądowanie na lotnisku na Maderze',
                'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'crash404',
                'view_count': int,
                'average_rating': float,
                'duration': 137,
                'age_limit': 0,
            },
        },
        {
            # Age-restricted
            'url': 'http://www.cda.pl/video/1273454c4',
            'info_dict': {
                'id': '1273454c4',
                'ext': 'mp4',
                'title': 'Bronson (2008) napisy HD 1080p',
                'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
                'height': 1080,
                'uploader': 'boniek61',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 5554,
                'age_limit': 18,
                'view_count': int,
                'average_rating': float,
            },
        },
        {
            'url': 'http://ebd.cda.pl/0x0/5749950c',
            'only_matching': True,
        },
    ]
8b0d7a66 91
0c265486
YCH
92 def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
93 form_data = random_birthday('rok', 'miesiac', 'dzien')
94 form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
95 data, content_type = multipart_encode(form_data)
96 return self._download_webpage(
97 urljoin(url, '/a/validatebirth'), video_id, *args,
98 data=data, headers={
99 'Referer': url,
100 'Content-Type': content_type,
101 }, **kwargs)
102
34f00179 103 def _perform_login(self, username, password):
104 cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
105 if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
106 self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
107 return
108
109 password_hash = base64.urlsafe_b64encode(hmac.new(
110 b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P',
111 ''.join(f'{bytes((bt & 255, )).hex():0>2}'
112 for bt in hashlib.md5(password.encode()).digest()).encode(),
113 hashlib.sha256).digest()).decode().replace('=', '')
114
115 token_res = self._download_json(
116 f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'',
117 headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH},
118 query={
119 'grant_type': 'password',
120 'login': username,
121 'password': password_hash,
122 })
123 self.cache.store(self._BEARER_CACHE, username, {
124 'token': token_res['access_token'],
125 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(),
126 })
127 self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
128
8b0d7a66
KM
129 def _real_extract(self, url):
130 video_id = self._match_id(url)
34f00179 131
132 if 'Authorization' in self._API_HEADERS:
133 return self._api_extract(video_id)
134 else:
135 return self._web_extract(video_id, url)
136
137 def _api_extract(self, video_id):
138 meta = self._download_json(
139 f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
140
141 if meta.get('premium') and not meta.get('premium_free'):
142 self.report_drm(video_id)
143
144 uploader = traverse_obj(meta, 'author', 'login')
145
146 formats = [{
147 'url': quality['file'],
148 'format': quality.get('title'),
149 'resolution': quality.get('name'),
150 'height': try_call(lambda: int(quality['name'][:-1])),
151 'filesize': quality.get('length'),
152 } for quality in meta['qualities'] if quality.get('file')]
153
34f00179 154 return {
155 'id': video_id,
156 'title': meta.get('title'),
157 'description': meta.get('description'),
158 'uploader': None if uploader == 'anonim' else uploader,
159 'average_rating': float_or_none(meta.get('rating')),
160 'thumbnail': meta.get('thumb'),
161 'formats': formats,
162 'duration': meta.get('duration'),
163 'age_limit': 18 if meta.get('for_adults') else 0,
164 'view_count': meta.get('views'),
165 }
166
167 def _web_extract(self, video_id, url):
577281b0
KM
168 self._set_cookie('cda.pl', 'cda.player', 'html5')
169 webpage = self._download_webpage(
170 self._BASE_URL + '/video/' + video_id, video_id)
8b0d7a66
KM
171
172 if 'Ten film jest dostępny dla użytkowników premium' in webpage:
173 raise ExtractorError('This video is only available for premium users.', expected=True)
174
cc2db878 175 if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
176 self.raise_geo_restricted()
177
0c265486 178 need_confirm_age = False
2181983a 179 if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
0c265486
YCH
180 webpage, 'birthday validate form', default=None):
181 webpage = self._download_age_confirm_page(
182 url, video_id, note='Confirming age')
183 need_confirm_age = True
184
8b0d7a66
KM
185 formats = []
186
577281b0
KM
187 uploader = self._search_regex(r'''(?x)
188 <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
189 (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
190 <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
191 ''', webpage, 'uploader', default=None, group='uploader')
192 view_count = self._search_regex(
193 r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
194 'view_count', default=None)
195 average_rating = self._search_regex(
38d70284 196 (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
197 r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
198 group='rating_value')
577281b0 199
f1ced6df
S
200 info_dict = {
201 'id': video_id,
577281b0
KM
202 'title': self._og_search_title(webpage),
203 'description': self._og_search_description(webpage),
204 'uploader': uploader,
205 'view_count': int_or_none(view_count),
206 'average_rating': float_or_none(average_rating),
207 'thumbnail': self._og_search_thumbnail(webpage),
f1ced6df
S
208 'formats': formats,
209 'duration': None,
0c265486 210 'age_limit': 18 if need_confirm_age else 0,
f1ced6df 211 }
8b0d7a66 212
41d1cca3 213 info = self._search_json_ld(webpage, video_id, default={})
214
38d70284 215 # Source: https://www.cda.pl/js/player.js?t=1606154898
216 def decrypt_file(a):
217 for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
218 a = a.replace(p, '')
219 a = compat_urllib_parse_unquote(a)
220 b = []
221 for c in a:
222 f = compat_ord(c)
ac668111 223 b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f))
38d70284 224 a = ''.join(b)
225 a = a.replace('.cda.mp4', '')
226 for p in ('.2cda.pl', '.3cda.pl'):
227 a = a.replace(p, '.cda.pl')
228 if '/upstream' in a:
229 a = a.replace('/upstream', '.mp4/upstream')
230 return 'https://' + a
231 return 'https://' + a + '.mp4'
232
f1ced6df 233 def extract_format(page, version):
f8f18f33 234 json_str = self._html_search_regex(
577281b0
KM
235 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
236 '%s player_json' % version, fatal=False, group='player_data')
237 if not json_str:
238 return
239 player_data = self._parse_json(
240 json_str, '%s player_data' % version, fatal=False)
241 if not player_data:
242 return
243 video = player_data.get('video')
244 if not video or 'file' not in video:
245 self.report_warning('Unable to extract %s version information' % version)
f1ced6df 246 return
fdeea726
AS
247 if video['file'].startswith('uggc'):
248 video['file'] = codecs.decode(video['file'], 'rot_13')
249 if video['file'].endswith('adc.mp4'):
250 video['file'] = video['file'].replace('adc.mp4', '.mp4')
38d70284 251 elif not video['file'].startswith('http'):
252 video['file'] = decrypt_file(video['file'])
05664a2f 253 video_quality = video.get('quality')
254 qualities = video.get('qualities', {})
255 video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
256 info_dict['formats'].append({
577281b0 257 'url': video['file'],
05664a2f 258 'format_id': video_quality,
259 'height': int_or_none(video_quality[:-1]),
260 })
261 for quality, cda_quality in qualities.items():
262 if quality == video_quality:
263 continue
264 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
265 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
266 data = json.dumps(data).encode('utf-8')
267 video_url = self._download_json(
268 f'https://www.cda.pl/video/{video_id}', video_id, headers={
269 'Content-Type': 'application/json',
270 'X-Requested-With': 'XMLHttpRequest'
271 }, data=data, note=f'Fetching {quality} url',
272 errnote=f'Failed to fetch {quality} url', fatal=False)
273 if try_get(video_url, lambda x: x['result']['status']) == 'ok':
274 video_url = try_get(video_url, lambda x: x['result']['resp'])
275 info_dict['formats'].append({
276 'url': video_url,
277 'format_id': quality,
278 'height': int_or_none(quality[:-1])
279 })
280
f1ced6df 281 if not info_dict['duration']:
577281b0 282 info_dict['duration'] = parse_duration(video.get('duration'))
f1ced6df
S
283
284 extract_format(webpage, 'default')
285
286 for href, resolution in re.findall(
287 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
288 webpage):
0c265486
YCH
289 if need_confirm_age:
290 handler = self._download_age_confirm_page
291 else:
292 handler = self._download_webpage
293
294 webpage = handler(
41d1cca3 295 urljoin(self._BASE_URL, href), video_id,
577281b0 296 'Downloading %s version information' % resolution, fatal=False)
8b0d7a66 297 if not webpage:
f1ced6df
S
298 # Manually report warning because empty page is returned when
299 # invalid version is requested.
300 self.report_warning('Unable to download %s version information' % resolution)
8b0d7a66 301 continue
0c265486 302
f1ced6df 303 extract_format(webpage, resolution)
8b0d7a66 304
38d70284 305 return merge_dicts(info_dict, info)