]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/odnoklassniki.py
[extractor/rutube] Extract chapters from description (#6345)
[yt-dlp.git] / yt_dlp / extractor / odnoklassniki.py
CommitLineData
4ffbf778 1from .common import InfoExtractor
c9fd5306 2from ..compat import (
1c35b3da 3 compat_etree_fromstring,
c9fd5306
S
4 compat_parse_qs,
5 compat_urllib_parse_unquote,
6 compat_urllib_parse_urlparse,
7)
4ffbf778 8from ..utils import (
1806a754 9 ExtractorError,
d984a98d 10 float_or_none,
4ffbf778
S
11 int_or_none,
12 qualities,
8196182a 13 smuggle_url,
b23b503e 14 traverse_obj,
372744c5 15 unescapeHTML,
8196182a 16 unified_strdate,
17 unsmuggle_url,
a3474aa5 18 urlencode_postdata,
4ffbf778
S
19)
20
21
class OdnoklassnikiIE(InfoExtractor):
    """Extractor for videos hosted on ok.ru / odnoklassniki.ru.

    Extraction is attempted against the desktop page first; if that raises an
    ExtractorError the mobile page is tried as a fallback (see _real_extract).
    """

    _VALID_URL = r'''(?x)
                https?://
                    (?:(?:www|m|mobile)\.)?
                    (?:odnoklassniki|ok)\.ru/
                    (?:
                        video(?P<embed>embed)?/|
                        web-api/video/moviePlayer/|
                        live/|
                        dk\?.*?st\.mvId=
                    )
                    (?P<id>[\d-]+)
                '''
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1']
    _TESTS = [{
        'note': 'Coub embedded',
        'url': 'http://ok.ru/video/1484130554189',
        'info_dict': {
            'id': '1keok9',
            'ext': 'mp4',
            'timestamp': 1545580896,
            'view_count': int,
            'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg',
            'title': 'Народная забава',
            'uploader': 'Nevata',
            'upload_date': '20181223',
            'age_limit': 0,
            'uploader_id': 'nevata.s',
            'like_count': int,
            'duration': 8.08,
            'repost_count': int,
        },
    }, {
        'note': 'vk.com embedded',
        'url': 'https://ok.ru/video/3568183087575',
        'info_dict': {
            'id': '-165101755_456243749',
            'ext': 'mp4',
            'uploader_id': '-165101755',
            'duration': 132,
            'timestamp': 1642869935,
            'upload_date': '20220122',
            'thumbnail': str,
            'title': str,
            'uploader': str,
        },
    }, {
        # metadata in JSON
        'url': 'http://ok.ru/video/20079905452',
        'md5': '5d2b64756e2af296e3b383a0bc02a6aa',
        'info_dict': {
            'id': '20079905452',
            'ext': 'mp4',
            'title': 'Культура меняет нас (прекрасный ролик!))',
            'thumbnail': str,
            'duration': 100,
            'upload_date': '20141207',
            'uploader_id': '330537914540',
            'uploader': 'Виталий Добровольский',
            'like_count': int,
            'age_limit': 0,
        },
    }, {
        # metadataUrl
        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
        'md5': 'f8c951122516af72e6e6ffdd3c41103b',
        'info_dict': {
            'id': '63567059965189-0',
            'ext': 'mp4',
            'title': 'Девушка без комплексов ...',
            'thumbnail': str,
            'duration': 191,
            'upload_date': '20150518',
            'uploader_id': '534380003155',
            'uploader': '☭ Андрей Мещанинов ☭',
            'like_count': int,
            'age_limit': 0,
            'start_time': 5,
        },
    }, {
        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
        'url': 'https://ok.ru/video/3952212382174',
        'md5': '91749d0bd20763a28d083fa335bbd37a',
        'info_dict': {
            'id': '5axVgHHDBvU',
            'ext': 'mp4',
            'title': 'Youtube-dl 101: What is it and HOW to use it! Full Download Walkthrough and Guide',
            'description': 'md5:b57209eeb9d5c2f20c984dfb58862097',
            'uploader': 'Lod Mer',
            'uploader_id': '575186401502',
            'duration': 1529,
            'age_limit': 0,
            'upload_date': '20210405',
            'comment_count': int,
            'live_status': 'not_live',
            'view_count': int,
            'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8',
            'uploader_url': 'http://www.youtube.com/user/MrKewlkid94',
            'channel_follower_count': int,
            'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'],
            'channel_id': 'UCVGtvURtEURYHtJFUegdSug',
            'like_count': int,
            'availability': 'public',
            'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug',
            'categories': ['Education'],
            'playable_in_embed': True,
            'channel': 'BornToReact',
        },
    }, {
        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
        'url': 'http://ok.ru/video/62036049272859-0',
        'info_dict': {
            'id': '62036049272859-0',
            'ext': 'mp4',
            'title': 'МУЗЫКА ДОЖДЯ .',
            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
            'upload_date': '20120106',
            'uploader_id': '473534735899',
            'uploader': 'МARINA D',
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Video has not been found',
    }, {
        # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading
        'note': 'Only available in mobile webpage',
        'url': 'https://m.ok.ru/video/2361249957145',
        'info_dict': {
            'id': '2361249957145',
            'ext': 'mp4',
            'title': 'Быковское крещение',
            'duration': 3038.181,
        },
        'skip': 'HTTP Error 400',
    }, {
        'note': 'subtitles',
        'url': 'https://ok.ru/video/4249587550747',
        'info_dict': {
            'id': '4249587550747',
            'ext': 'mp4',
            'title': 'Small Country An African Childhood (2020) (1080p) +subtitle',
            'uploader': 'Sunflower Movies',
            'uploader_id': '595802161179',
            'upload_date': '20220816',
            'duration': 6728,
            'age_limit': 0,
            'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+',
            'like_count': int,
            'subtitles': dict,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/video/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://www.ok.ru/videoembed/20648036891',
        'only_matching': True,
    }, {
        'url': 'http://m.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'http://mobile.ok.ru/video/20079905452',
        'only_matching': True,
    }, {
        'url': 'https://www.ok.ru/live/484531969818',
        'only_matching': True,
    }, {
        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
        'only_matching': True,
    }, {
        # Paid video
        'url': 'https://ok.ru/video/954886983203',
        'only_matching': True,
    }, {
        'url': 'https://ok.ru/videoembed/2932705602075',
        'info_dict': {
            'id': '2932705602075',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8',
            'title': 'Boosty для тебя!',
            'uploader_id': '597811038747',
            'like_count': 0,
            'duration': 35,
        },
    }]

    _WEBPAGE_TESTS = [{
        'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167',
        'info_dict': {
            'id': '3950343629563',
            'ext': 'mp4',
            'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8',
            'title': 'Заяц Бусти.mp4',
            'uploader_id': '571368965883',
            'like_count': 0,
            'duration': 10444,
        },
        'skip': 'Site no longer embeds',
    }]

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield embed URLs found in *webpage*, each smuggled with the embedding
        page's URL under the 'referrer' key so that _extract_desktop can send it
        as the Referer header."""
        for x in super()._extract_embed_urls(url, webpage):
            yield smuggle_url(x, {'referrer': url})

    def _real_extract(self, url):
        """Extract from the desktop page, falling back to the mobile page.

        If both attempts raise ExtractorError, the desktop error is the one
        that propagates.
        """
        try:
            return self._extract_desktop(url)
        except ExtractorError as desktop_error:
            try:
                return self._extract_mobile(url)
            except ExtractorError:
                # error message of desktop webpage is in English
                raise desktop_error

    def _extract_desktop(self, url):
        """Extract video info from the desktop (ok.ru) page.

        Returns either a url_result / url_transparent dict delegating to
        another extractor (external player, OPEN_GRAPH or USER_YOUTUBE
        providers) or a full info dict with formats.

        Raises ExtractorError when the page shows an error stub or when no
        title is available for a natively hosted video.
        """
        start_time = int_or_none(compat_parse_qs(
            compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])

        url, smuggled = unsmuggle_url(url, {})
        video_id, is_embed = self._match_valid_url(url).group('id', 'embed')
        mode = 'videoembed' if is_embed else 'video'

        webpage = self._download_webpage(
            f'https://ok.ru/{mode}/{video_id}', video_id,
            note='Downloading desktop webpage',
            headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {})

        error = self._search_regex(
            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
            webpage, 'error', default=None)
        # Direct link from boosty
        if (error == 'The author of this video has not been found or is blocked'
                and not smuggled.get('referrer') and mode == 'videoembed'):
            # Retry once with a boosty referrer; the `not smuggled.get('referrer')`
            # guard above prevents this from recursing more than one level
            return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
        elif error:
            raise ExtractorError(error, expected=True)

        player = self._parse_json(
            unescapeHTML(self._search_regex(
                r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
                webpage, 'player', group='player')),
            video_id)

        # embedded external player
        if player.get('isExternalPlayer') and player.get('url'):
            return self.url_result(player['url'])

        flashvars = player['flashvars']

        # Metadata is either inlined as a JSON string or has to be fetched
        # from metadataUrl with a POST request
        metadata = flashvars.get('metadata')
        if metadata:
            metadata = self._parse_json(metadata, video_id)
        else:
            data = {}
            st_location = flashvars.get('location')
            if st_location:
                data['st.location'] = st_location
            metadata = self._download_json(
                compat_urllib_parse_unquote(flashvars['metadataUrl']),
                video_id, 'Downloading metadata JSON',
                data=urlencode_postdata(data))

        movie = metadata['movie']

        # Some embedded videos may not contain title in movie dict (e.g.
        # http://ok.ru/video/62036049272859-0) thus we allow missing title
        # here and it's going to be extracted later by an extractor that
        # will process the actual embed.
        provider = metadata.get('provider')
        title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')

        thumbnail = movie.get('poster')
        duration = int_or_none(movie.get('duration'))

        # `or {}` also covers an explicit null author in the JSON
        author = metadata.get('author') or {}
        uploader_id = author.get('id')
        uploader = author.get('name')

        upload_date = unified_strdate(self._html_search_meta(
            'ya:ovs:upload_date', webpage, 'upload date', default=None))

        age_limit = None
        adult = self._html_search_meta(
            'ya:ovs:adult', webpage, 'age limit', default=None)
        if adult:
            age_limit = 18 if adult == 'true' else 0

        like_count = int_or_none(metadata.get('likeCount'))

        subtitles = {}
        for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict):
            sub_url = sub.get('url')
            if not sub_url:
                continue
            subtitles.setdefault(sub.get('language') or 'en', []).append({
                'url': sub_url,
                'ext': 'vtt',
            })

        info = {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'upload_date': upload_date,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'like_count': like_count,
            'age_limit': age_limit,
            'start_time': start_time,
            'subtitles': subtitles,
        }

        # pladform (OPEN_GRAPH) and YouTube (USER_YOUTUBE) embeds are handed
        # over to whichever extractor handles movie['contentId']
        if provider in ('OPEN_GRAPH', 'USER_YOUTUBE'):
            info.update({
                '_type': 'url_transparent',
                'url': movie['contentId'],
            })
            return info

        # From here on the video is handled natively, so a title is required.
        # Raised explicitly (not via assert) so it survives `python -O` and is
        # caught by the mobile fallback in _real_extract.
        if not title:
            raise ExtractorError('Unable to extract video title')

        quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))

        # Tolerate a missing 'videos' list so the live and paid-video handling
        # below is still reached
        formats = [{
            'url': f['url'],
            'ext': 'mp4',
            'format_id': f.get('name'),
        } for f in traverse_obj(metadata, ('videos', ...), expected_type=dict) if f.get('url')]

        m3u8_url = metadata.get('hlsManifestUrl')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))

        dash_manifest = metadata.get('metadataEmbedded')
        if dash_manifest:
            formats.extend(self._parse_mpd_formats(
                compat_etree_fromstring(dash_manifest), 'mpd'))

        # Rank formats by the single-digit `type` value found in their URL
        for fmt in formats:
            fmt_type = self._search_regex(
                r'\btype[/=](\d)', fmt['url'],
                'format type', default=None)
            if fmt_type:
                fmt['quality'] = quality(fmt_type)

        # Live formats
        m3u8_url = metadata.get('hlsMasterPlaylistUrl')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
        rtmp_url = metadata.get('rtmpUrl')
        if rtmp_url:
            formats.append({
                'url': rtmp_url,
                'format_id': 'rtmp',
                'ext': 'flv',
            })

        if not formats:
            payment_info = metadata.get('paymentInfo')
            if payment_info:
                self.raise_no_formats('This video is paid, subscribe to download it', expected=True)

        info['formats'] = formats
        return info

    def _extract_mobile(self, url):
        """Extract video info from the mobile (m.ok.ru) page.

        Reads the JSON stored in the page's data-video attribute and returns
        an info dict with a single 'mobile' format.

        Raises ExtractorError when the page shows an error message or when
        the JSON carries no video source URL.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            'http://m.ok.ru/video/%s' % video_id, video_id,
            note='Downloading mobile webpage')

        error = self._search_regex(
            r'видео</a>\s*<div\s+class="empty">(.+?)</div>',
            webpage, 'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

        json_data = self._search_regex(
            r'data-video="(.+?)"', webpage, 'json data')
        json_data = self._parse_json(unescapeHTML(json_data), video_id) or {}

        # Without a source URL the only format would be unusable; fail here so
        # that _real_extract reports the desktop error instead
        video_src = json_data.get('videoSrc')
        if not video_src:
            raise ExtractorError('Unable to extract video URL from mobile webpage')

        return {
            'id': video_id,
            'title': json_data.get('videoName'),
            'duration': float_or_none(json_data.get('videoDuration'), scale=1000),
            'thumbnail': json_data.get('videoPosterSrc'),
            'formats': [{
                'format_id': 'mobile',
                'url': video_src,
                'ext': 'mp4',
            }]
        }