# jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/odnoklassniki.py
# Commit: [extractor/odnoklassniki] Support boosty.to embeds (#5105)
# Path: yt-dlp.git / yt_dlp / extractor / odnoklassniki.py
from .common import InfoExtractor
from ..compat import (
    compat_etree_fromstring,
    compat_parse_qs,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    qualities,
    smuggle_url,
    unescapeHTML,
    unified_strdate,
    unsmuggle_url,
    url_or_none,
    urlencode_postdata,
)


class OdnoklassnikiIE(InfoExtractor):
    """Extractor for videos hosted on ok.ru / odnoklassniki.ru.

    The desktop page is tried first; if that raises ExtractorError the mobile
    page is tried, and if both fail the desktop error is the one reported.
    Videos that merely wrap an external player are delegated to the
    extractor responsible for the embedded URL.
    """

    _VALID_URL = r'''(?x)
                https?://
                    (?:(?:www|m|mobile)\.)?
                    (?:odnoklassniki|ok)\.ru/
                    (?:
                        video(?P<embed>embed)?/|
                        web-api/video/moviePlayer/|
                        live/|
                        dk\?.*?st\.mvId=
                    )
                    (?P<id>[\d-]+)
                '''
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
    _TESTS = [{
        'note': 'Coub embedded',
        'url': 'http://ok.ru/video/1484130554189',
        'info_dict': {
            'id': '1keok9',
            'ext': 'mp4',
            'timestamp': 1545580896,
            'view_count': int,
            'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
            'title': 'Народная забава',
            'uploader': 'Nevata',
            'upload_date': '20181223',
            'age_limit': 0,
            'uploader_id': 'nevata.s',
            'like_count': int,
            'duration': 8.08,
            'repost_count': int,
        },
    }, {
        'note': 'vk.com embedded',
        'url': 'https://ok.ru/video/3568183087575',
        'info_dict': {
            'id': '-165101755_456243749',
            'ext': 'mp4',
            'uploader_id': '-165101755',
            'duration': 132,
            'timestamp': 1642869935,
            'upload_date': '20220122',
            'thumbnail': str,
            'title': str,
            'uploader': str,
        },
    }, {
        # metadata in JSON
        'url': 'http://ok.ru/video/20079905452',
        'md5': '5d2b64756e2af296e3b383a0bc02a6aa',
        'info_dict': {
            'id': '20079905452',
            'ext': 'mp4',
            'title': 'Культура меняет нас (прекрасный ролик!))',
            'thumbnail': str,
            'duration': 100,
            'upload_date': '20141207',
            'uploader_id': '330537914540',
            'uploader': 'Виталий Добровольский',
            'like_count': int,
            'age_limit': 0,
        },
    }, {
        # metadataUrl
        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
        'md5': 'f8c951122516af72e6e6ffdd3c41103b',
        'info_dict': {
            'id': '63567059965189-0',
            'ext': 'mp4',
            'title': 'Девушка без комплексов ...',
            'thumbnail': str,
            'duration': 191,
            'upload_date': '20150518',
            'uploader_id': '534380003155',
            'uploader': '☭ Андрей Мещанинов ☭',
            'like_count': int,
            'age_limit': 0,
            'start_time': 5,
        },
    }, {
        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
        'url': 'https://ok.ru/video/3952212382174',
        'md5': '91749d0bd20763a28d083fa335bbd37a',
        'info_dict': {
            'id': '5axVgHHDBvU',
            'ext': 'mp4',
            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
            'uploader': 'Lod Mer',
            'uploader_id': '575186401502',
            'duration': 1529,
            'age_limit': 0,
            'upload_date': '20210405',
            'comment_count': int,
            'live_status': 'not_live',
            'view_count': int,
            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
            'uploader_url': 'http://www.youtube.com/user/MrKewlkid94',
            'channel_follower_count': int,
            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
            'like_count': int,
            'availability': 'public',
            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
            'categories': ['Education'],
            'playable_in_embed': True,
            'channel': 'BornToReact',
        },
    }, {
        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
        'url': 'http://ok.ru/video/62036049272859-0',
        'info_dict': {
            'id': '62036049272859-0',
            'ext': 'mp4',
            'title': 'МУЗЫКА ДОЖДЯ .',
            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
            'upload_date': '20120106',
            'uploader_id': '473534735899',
            'uploader': 'МARINA D',
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has not been found',
    }, {
        # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading
        'note': 'Only available in mobile webpage',
        'url': 'https://m.ok.ru/video/2361249957145',
        'info_dict': {
            'id': '2361249957145',
            'ext': 'mp4',
            'title': 'Быковское крещение',
            'duration': 3038.181,
        },
    }, {
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/video/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/videoembed/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://m.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://mobile.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'https://www.ok.ru/live/484531969818',
        'only_matching': True,
    }, {
        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
        'only_matching': True,
    }, {
        # Paid video
        'url': 'https://ok.ru/video/954886983203',
        'only_matching': True,
    }, {
        'url': 'https://ok.ru/videoembed/2932705602075',
        'info_dict': {
            'id': '2932705602075',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
            'title': 'Boosty для тебя!',
            'uploader_id': '597811038747',
            'like_count': 0,
            'duration': 35,
        },
    }]

    _WEBPAGE_TESTS = [{
        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
        'info_dict': {
            'id': '3950343629563',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
            'title': 'Заяц Бусти.mp4',
            'uploader_id': '571368965883',
            'like_count': 0,
            'duration': 10444,
        },
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield embed URLs found in ``webpage``, each smuggled with the
        embedding page ``url`` as ``referrer`` so that it can later be sent
        as the ``Referer`` header by ``_extract_desktop``."""
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(embed_url, {'referrer': url})

    def _real_extract(self, url):
        """Extract from the desktop page, falling back to the mobile page.

        Raises the desktop ExtractorError when both attempts fail.
        """
        try:
            return self._extract_desktop(url)
        except ExtractorError as desktop_error:
            try:
                return self._extract_mobile(url)
            except ExtractorError:
                # error message of desktop webpage is in English
                raise desktop_error

    def _extract_desktop(self, url):
        """Extract video info from the desktop (or ``videoembed``) page.

        Returns an info dict; for external players and for the OPEN_GRAPH /
        USER_YOUTUBE providers the result delegates to another extractor.
        Raises ExtractorError when the page reports an error or when a
        natively hosted video has no title.
        """
        start_time = int_or_none(compat_parse_qs(
            compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])

        url, smuggled = unsmuggle_url(url, {})
        video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
        mode = 'videoembed' if is_embed else 'video'
        referrer = smuggled.get('referrer')

        webpage = self._download_webpage(
            f'https://ok.ru/{mode}/{video_id}', video_id,
            note='Downloading desktop webpage',
            headers={'Referer': referrer} if referrer else {})

        error = self._search_regex(
            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
            webpage, 'error', default=None)
        # Direct link from boosty: retry once with a boosty.to referrer.
        # The `not referrer` guard keeps this from recursing more than once.
        if (error == 'The author of this video has not been found or is blocked'
                and not referrer and mode == 'videoembed'):
            return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
        elif error:
            raise ExtractorError(error, expected=True)

        player = self._parse_json(
            unescapeHTML(self._search_regex(
                r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
                webpage, 'player', group='player')),
            video_id)

        # embedded external player
        if player.get('isExternalPlayer') and player.get('url'):
            return self.url_result(player['url'])

        flashvars = player['flashvars']

        # Metadata is either inlined as a JSON string or must be fetched
        # from metadataUrl with a POST request.
        metadata = flashvars.get('metadata')
        if metadata:
            metadata = self._parse_json(metadata, video_id)
        else:
            data = {}
            st_location = flashvars.get('location')
            if st_location:
                data['st.location'] = st_location
            metadata = self._download_json(
                compat_urllib_parse_unquote(flashvars['metadataUrl']),
                video_id, 'Downloading metadata JSON',
                data=urlencode_postdata(data))

        movie = metadata['movie']

        # Some embedded videos may not contain title in movie dict (e.g.
        # http://ok.ru/video/62036049272859-0) thus we allow missing title
        # here and it's going to be extracted later by an extractor that
        # will process the actual embed.
        provider = metadata.get('provider')
        title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')

        thumbnail = movie.get('poster')
        duration = int_or_none(movie.get('duration'))

        # `or {}` also covers an explicit JSON null, which .get(key, {})
        # would pass through as None.
        author = metadata.get('author') or {}
        uploader_id = author.get('id')
        uploader = author.get('name')

        upload_date = unified_strdate(self._html_search_meta(
            'ya:ovs:upload_date', webpage, 'upload date', default=None))

        age_limit = None
        adult = self._html_search_meta(
            'ya:ovs:adult', webpage, 'age limit', default=None)
        if adult:
            age_limit = 18 if adult == 'true' else 0

        like_count = int_or_none(metadata.get('likeCount'))

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'upload_date': upload_date,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'like_count': like_count,
            'age_limit': age_limit,
            'start_time': start_time,
        }

        # OPEN_GRAPH (pladform) and USER_YOUTUBE videos are hosted elsewhere:
        # hand over to the extractor matching contentId.
        if provider in ('OPEN_GRAPH', 'USER_YOUTUBE'):
            info.update({
                '_type': 'url_transparent',
                'url': movie['contentId'],
            })
            return info

        # From here on the video is served by ok.ru itself and needs a title.
        # Raise ExtractorError rather than assert: asserts vanish under -O and
        # an AssertionError would skip the mobile fallback in _real_extract.
        if not title:
            raise ExtractorError('Unable to extract video title')

        quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))

        # Tolerate a missing 'videos' list (so the paid-video check below can
        # still run) and skip entries without a usable URL.
        formats = [{
            'url': f['url'],
            'ext': 'mp4',
            'format_id': f.get('name'),
        } for f in metadata.get('videos') or [] if url_or_none(f.get('url'))]

        m3u8_url = metadata.get('hlsManifestUrl')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))

        dash_manifest = metadata.get('metadataEmbedded')
        if dash_manifest:
            formats.extend(self._parse_mpd_formats(
                compat_etree_fromstring(dash_manifest), 'mpd'))

        # Rank the formats gathered so far by the "type" digit in their URL.
        # The live formats appended below are deliberately left unranked.
        for fmt in formats:
            fmt_type = self._search_regex(
                r'\btype[/=](\d)', fmt['url'],
                'format type', default=None)
            if fmt_type:
                fmt['quality'] = quality(fmt_type)

        # Live formats
        m3u8_url = metadata.get('hlsMasterPlaylistUrl')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
        rtmp_url = metadata.get('rtmpUrl')
        if rtmp_url:
            formats.append({
                'url': rtmp_url,
                'format_id': 'rtmp',
                'ext': 'flv',
            })

        if not formats and metadata.get('paymentInfo'):
            self.raise_no_formats('This video is paid, subscribe to download it', expected=True)

        self._sort_formats(formats)

        info['formats'] = formats
        return info

    def _extract_mobile(self, url):
        """Extract video info from the mobile page (m.ok.ru).

        Returns an info dict with a single ``mobile`` format.
        Raises ExtractorError when the page reports an error or carries no
        video source URL.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            f'http://m.ok.ru/video/{video_id}', video_id,
            note='Downloading mobile webpage')

        error = self._search_regex(
            r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
            webpage, 'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

        json_data = self._search_regex(
            r'data-video="(.+?)"', webpage, 'json data')
        json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}

        # Without a source URL the single format would be unusable; fail here
        # so that _real_extract can report the desktop error instead.
        video_src = json_data.get('videoSrc')
        if not video_src:
            raise ExtractorError('Unable to extract mobile video URL')

        return {
            'id': video_id,
            'title': json_data.get('videoName'),
            'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
            'thumbnail': json_data.get('videoPosterSrc'),
            'formats': [{
                'format_id': 'mobile',
                'url': video_src,
                'ext': 'mp4',
            }],
        }