]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/cda.py
[compat] Remove more functions
[yt-dlp.git] / yt_dlp / extractor / cda.py
... / ...
CommitLineData
1import codecs
2import json
3import re
4
5from .common import InfoExtractor
6from ..compat import compat_ord, compat_urllib_parse_unquote
7from ..utils import (
8 ExtractorError,
9 float_or_none,
10 int_or_none,
11 merge_dicts,
12 multipart_encode,
13 parse_duration,
14 random_birthday,
15 try_get,
16 urljoin,
17)
18
19
class CDAIE(InfoExtractor):
    """Extractor for cda.pl video pages and ebd.cda.pl embed URLs."""

    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _BASE_URL = 'http://www.cda.pl/'
    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
            'upload_date': '20160221',
            'timestamp': 1456078244,
        }
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'view_count': int,
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
        }
    }, {
        # Age-restricted
        'url': 'http://www.cda.pl/video/1273454c4',
        'info_dict': {
            'id': '1273454c4',
            'ext': 'mp4',
            'title': 'Bronson (2008) napisy HD 1080p',
            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
            'height': 1080,
            'uploader': 'boniek61',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 5554,
            'age_limit': 18,
            'view_count': int,
            'average_rating': float,
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """POST the birthday-validation form and return the resulting page.

        The form carries a random birthday (fields ``rok``, ``miesiac``,
        ``dzien``) plus the page to return to, and is sent as multipart data
        to ``/a/validatebirth`` resolved against ``url``.

        Extra positional and keyword arguments are forwarded unchanged to
        ``_download_webpage`` (e.g. ``note``, ``fatal``).
        """
        form_data = random_birthday('rok', 'miesiac', 'dzien')
        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
        data, content_type = multipart_encode(form_data)
        return self._download_webpage(
            urljoin(url, '/a/validatebirth'), video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    # Source: https://www.cda.pl/js/player.js?t=1606154898
    @staticmethod
    def _decrypt_file(a):
        """Decode an obfuscated ``file`` value from the player data into a URL.

        Steps, in order: strip the known marker tokens, percent-decode, apply
        a ROT47 rotation to printable ASCII (code points 33-126), drop the
        ``.cda.mp4`` suffix marker, normalise the ``.2cda.pl``/``.3cda.pl``
        hosts to ``.cda.pl`` and finally add the scheme and ``.mp4`` extension.
        """
        for marker in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
            a = a.replace(marker, '')
        a = compat_urllib_parse_unquote(a)

        rotated = []
        for char in a:
            code = compat_ord(char)
            # 33 + (code + 14) % 94 == 33 + (code - 33 + 47) % 94, i.e. ROT47;
            # characters outside the printable range pass through unchanged
            rotated.append(chr(33 + (code + 14) % 94) if 33 <= code <= 126 else chr(code))
        a = ''.join(rotated)

        a = a.replace('.cda.mp4', '')
        for host in ('.2cda.pl', '.3cda.pl'):
            a = a.replace(host, '.cda.pl')
        if '/upstream' in a:
            # The extension belongs before the "/upstream" path component
            return 'https://' + a.replace('/upstream', '.mp4/upstream')
        return 'https://' + a + '.mp4'

    def _real_extract(self, url):
        """Build the info dict for the video addressed by ``url``.

        Downloads the video page, passes the age gate if one is shown,
        scrapes metadata, then collects formats from the player data of the
        default page and of every page linked by a quality button.

        Raises ExtractorError for premium-only videos and reports geo
        restriction through ``raise_geo_restricted``.
        """
        video_id = self._match_id(url)
        # NOTE(review): presumably makes the site serve the HTML5 player markup
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        # _BASE_URL already ends with "/"; plain concatenation would request
        # "http://www.cda.pl//video/<id>", so resolve the path with urljoin
        webpage = self._download_webpage(
            urljoin(self._BASE_URL, '/video/' + video_id), video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            raise ExtractorError('This video is only available for premium users.', expected=True)

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            webpage = self._download_age_confirm_page(
                url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        view_count = self._search_regex(
            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
            'view_count', default=None)
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'view_count': int_or_none(view_count),
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        def height_from_label(label):
            # Quality labels look like "720p"; a missing or non-string label
            # must yield None instead of raising on the slice
            return int_or_none(label[:-1]) if isinstance(label, str) else None

        def extract_format(page, version):
            """Append the formats described by ``page``'s player data to ``formats``."""
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                f'{version} player_json', fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, f'{version} player_data', fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            file_url = video.get('file') if video else None
            # A null/empty "file" is as unusable as a missing one
            if not file_url or not isinstance(file_url, str):
                self.report_warning(f'Unable to extract {version} version information')
                return

            if file_url.startswith('uggc'):
                # "uggc" is "http" under ROT13
                file_url = codecs.decode(file_url, 'rot_13')
                if file_url.endswith('adc.mp4'):
                    file_url = file_url.replace('adc.mp4', '.mp4')
            elif not file_url.startswith('http'):
                file_url = self._decrypt_file(file_url)
            video['file'] = file_url

            # "qualities" maps a label (e.g. "720p") to the site's internal
            # quality code; translate this page's code back into its label
            qualities = video.get('qualities') or {}
            video_quality = video.get('quality')
            video_quality = next(
                (label for label, code in qualities.items() if code == video_quality),
                video_quality)
            formats.append({
                'url': file_url,
                'format_id': video_quality,
                'height': height_from_label(video_quality),
            })

            # Ask the JSON-RPC endpoint for the direct link of every other quality
            for quality, cda_quality in qualities.items():
                if quality == video_quality:
                    continue
                data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                        'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
                data = json.dumps(data).encode('utf-8')
                response = self._download_json(
                    f'https://www.cda.pl/video/{video_id}', video_id, headers={
                        'Content-Type': 'application/json',
                        'X-Requested-With': 'XMLHttpRequest'
                    }, data=data, note=f'Fetching {quality} url',
                    errnote=f'Failed to fetch {quality} url', fatal=False)
                if try_get(response, lambda x: x['result']['status']) != 'ok':
                    continue
                video_url = try_get(response, lambda x: x['result']['resp'])
                if not video_url:
                    # An "ok" status without a link gives nothing to download
                    continue
                formats.append({
                    'url': video_url,
                    'format_id': quality,
                    'height': height_from_label(quality),
                })

            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        extract_format(webpage, 'default')

        # Age-gated videos need the birthday form re-submitted for each page
        handler = self._download_age_confirm_page if need_confirm_age else self._download_webpage

        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            version_page = handler(
                urljoin(self._BASE_URL, href), video_id,
                f'Downloading {resolution} version information', fatal=False)
            if not version_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning(f'Unable to download {resolution} version information')
                continue

            extract_format(version_page, resolution)

        self._sort_formats(formats)

        return merge_dicts(info_dict, info)