]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/mailru.py
[core] Fix HTTP headers and cookie handling
[yt-dlp.git] / yt_dlp / extractor / mailru.py
CommitLineData
f2908d07
S
1import itertools
2import json
69bb54eb 3import re
6c5211ce 4import urllib.parse
69bb54eb
S
5
6from .common import InfoExtractor
f2908d07 7from ..compat import compat_urllib_parse_unquote
b081350b
S
8from ..utils import (
9 int_or_none,
f2908d07 10 parse_duration,
b081350b 11 remove_end,
f2908d07 12 try_get,
8abd647c 13 urljoin,
b081350b 14)
69bb54eb
S
15
16
class MailRuIE(InfoExtractor):
    # Extractor for my.mail.ru video pages, embeds and "+/video/meta" URLs.
    # _VALID_URL yields either a numeric ``metaid`` or a path-like video id
    # (``idv1``, or ``idv2prefix`` + ``idv2suffix``).
    IE_NAME = 'mailru'
    IE_DESC = 'Видео@Mail.Ru'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:(?:www|m|videoapi)\.)?my\.mail\.ru/+
                        (?:
                            video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|
                            (?:videos/embed/)?(?:(?P<idv2prefix>(?:[^/]+/+){2})(?:video/(?:embed/)?)?(?P<idv2suffix>[^/]+/\d+))(?:\.html)?|
                            (?:video/embed|\+/video/meta)/(?P<metaid>\d+)
                        )
                    '''
    _TESTS = [
        {
            'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
            'md5': 'dea205f03120046894db4ebb6159879a',
            'info_dict': {
                'id': '46301138_76',
                'ext': 'mp4',
                'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
                'timestamp': 1393235077,
                'upload_date': '20140224',
                'uploader': 'sonypicturesrus',
                'uploader_id': 'sonypicturesrus@mail.ru',
                'duration': 184,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
            'md5': '00a91a58c3402204dcced523777b475f',
            'info_dict': {
                'id': '46843144_1263',
                'ext': 'mp4',
                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
                'timestamp': 1397039888,
                'upload_date': '20140409',
                'uploader': 'hitech',
                'uploader_id': 'hitech@corp.mail.ru',
                'duration': 245,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            # only available via metaUrl API
            'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html',
            'md5': '3b26d2491c6949d031a32b96bd97c096',
            'info_dict': {
                'id': '56664382_502',
                'ext': 'mp4',
                'title': ':8336',
                'timestamp': 1449094163,
                'upload_date': '20151202',
                'uploader': '720pizle@mail.ru',
                'uploader_id': '720pizle@mail.ru',
                'duration': 6001,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',
            'only_matching': True,
        },
        {
            'url': 'https://my.mail.ru/video/embed/7949340477499637815',
            'only_matching': True,
        },
        {
            'url': 'http://my.mail.ru/+/video/meta/7949340477499637815',
            'only_matching': True,
        },
        {
            'url': 'https://my.mail.ru//list/sinyutin10/video/_myvideo/4.html',
            'only_matching': True,
        },
        {
            'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html',
            'only_matching': True,
        },
        {
            'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009',
            'only_matching': True,
        },
        {
            'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for a single my.mail.ru video.

        The meta JSON URL is either derived directly from the ``metaid``
        group of the matched URL or looked up in the page config embedded in
        the downloaded webpage. When no meta JSON could be obtained, the
        older ``api.video.mail.ru`` endpoint is queried instead.

        Raises KeyError if the video JSON lacks ``videos``, ``meta`` or the
        ``title`` inside ``meta``; download failures propagate from the
        ``_download_*`` helpers.
        """
        mobj = self._match_valid_url(url)
        meta_id = mobj.group('metaid')

        video_id = None
        meta_url = None
        if meta_id:
            meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id
        else:
            video_id = mobj.group('idv1')
            if not video_id:
                video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
            webpage = self._download_webpage(url, video_id)
            page_config = self._parse_json(self._search_regex([
                r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
                r'(?s)"video":\s*({.+?}),'],
                webpage, 'page config', default='{}'), video_id, fatal=False)
            if isinstance(page_config, dict):
                # "video" may be absent or null, so never call .get() on it
                # without falling back to an empty dict first.
                meta_url = (
                    page_config.get('metaUrl')
                    or (page_config.get('video') or {}).get('metaUrl')
                    or page_config.get('metadataUrl'))

        video_data = None

        # Fix meta_url if it is missing the host address. meta_url is None
        # when the page carried no usable config; it must be skipped here so
        # that the fallback request below is still reached.
        if meta_url and re.match(r'^\/\+\/', meta_url):
            meta_url = urljoin('https://my.mail.ru', meta_url)

        if meta_url:
            # Only fatal when there is no video_id to fall back on.
            video_data = self._download_json(
                meta_url, video_id or meta_id, 'Downloading video meta JSON',
                fatal=not video_id)

        # Fallback old approach
        if not video_data:
            video_data = self._download_json(
                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
                video_id, 'Downloading video JSON')

        video_key = self._get_cookies('https://my.mail.ru').get('video_key')

        formats = []
        for f in video_data['videos']:
            video_url = f.get('url')
            if not video_url:
                continue
            if video_key:
                # Copy the video_key cookie onto the host serving the media.
                self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value)
            format_id = f.get('key')
            # Format keys shaped like "720p" carry the height.
            height = int_or_none(self._search_regex(
                r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'height': height,
            })

        meta_data = video_data['meta']
        title = remove_end(meta_data['title'], '.mp4')

        # Uploader details are optional; a missing "author" must not abort
        # the extraction.
        author = video_data.get('author') or {}
        uploader = author.get('name')
        uploader_id = author.get('id') or author.get('email')
        view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count'))

        acc_id = meta_data.get('accId')
        item_id = meta_data.get('itemId')
        content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id

        thumbnail = meta_data.get('poster')
        duration = int_or_none(meta_data.get('duration'))
        timestamp = int_or_none(meta_data.get('timestamp'))

        return {
            'id': content_id,
            'title': title,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
        }
f2908d07
S
189
190
class MailRuMusicSearchBaseIE(InfoExtractor):
    # Helpers shared by the Mail.Ru music extractors: one performs the
    # music.search ajax call, the other maps a track record to an info dict.

    def _search(self, query, url, audio_id, limit=100, offset=0):
        """Run a music search and return the first dict in the JSON response.

        ``url`` is sent as the Referer header and ``audio_id`` is passed to
        the downloader as the id of the request. ``limit`` and ``offset``
        select the result page. Raises StopIteration when the response
        contains no dict element.
        """
        page_number = offset // limit + 1
        request_headers = {
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        }
        request_query = {
            'xemail': '',
            'ajax_call': '1',
            'func_name': 'music.search',
            'mna': '',
            'mnb': '',
            'arg_query': query,
            'arg_extended': '1',
            'arg_search_params': json.dumps({
                'music': {
                    'limit': limit,
                    'offset': offset,
                },
            }),
            'arg_limit': limit,
            'arg_offset': offset,
        }
        response = self._download_json(
            'https://my.mail.ru/cgi-bin/my/ajax', audio_id,
            'Downloading songs JSON page %d' % page_number,
            headers=request_headers, query=request_query)
        dict_items = (item for item in response if isinstance(item, dict))
        return next(dict_items)

    @staticmethod
    def _extract_track(t, fatal=True):
        """Convert a track record ``t`` into an info dict.

        With ``fatal=True`` a missing ``URL`` or ``File`` key raises
        KeyError; otherwise the lookup is lenient. In both modes None is
        returned when either value is empty.
        """
        def required(key):
            # Strict subscript when fatal, lenient lookup otherwise.
            return t[key] if fatal else t.get(key)

        audio_url = required('URL')
        if not audio_url:
            return None

        audio_id = required('File')
        if not audio_id:
            return None

        thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover')
        uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML')
        uploader_id = t.get('UploaderID')

        # Prefer the numeric duration; otherwise parse the textual one.
        duration = int_or_none(t.get('DurationInSeconds'))
        if not duration:
            duration = parse_duration(t.get('Duration') or t.get('DurationStr'))
        view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr'))

        track = t.get('Name') or t.get('Name_Text_HTML')
        artist = t.get('Author') or t.get('Author_Text_HTML')

        # Title is "artist - track", just the track, or the id as last resort.
        if not track:
            title = audio_id
        elif artist:
            title = '%s - %s' % (artist, track)
        else:
            title = track

        return {
            'extractor_key': MailRuMusicIE.ie_key(),
            'id': audio_id,
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'duration': duration,
            'view_count': view_count,
            'vcodec': 'none',
            'abr': int_or_none(t.get('BitRate')),
            'track': track,
            'artist': artist,
            'album': t.get('Album'),
            'url': audio_url,
        }
259
260
class MailRuMusicIE(MailRuMusicSearchBaseIE):
    # Extractor for a single song page under my.mail.ru/music/songs/.
    IE_NAME = 'mailru:music'
    IE_DESC = 'Музыка@Mail.Ru'
    _VALID_URL = r'https?://my\.mail\.ru/+music/+songs/+[^/?#&]+-(?P<id>[\da-f]+)'
    _TESTS = [{
        'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893',
        'md5': '0f8c22ef8c5d665b13ac709e63025610',
        'info_dict': {
            'id': '4e31f7125d0dfaef505d947642366893',
            'ext': 'mp3',
            'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ',
            'uploader': 'Игорь Мудрый',
            'uploader_id': '1459196328',
            'duration': 280,
            'view_count': int,
            'vcodec': 'none',
            'abr': 320,
            'track': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017',
            'artist': 'М8Л8ТХ',
        },
    }]

    def _real_extract(self, url):
        """Resolve a song page to its track info.

        The page's OpenGraph title is used as the search query, and the
        search result whose ``File`` equals the id from the URL is picked.
        Raises StopIteration when no result matches.
        """
        audio_id = self._match_id(url)
        page = self._download_webpage(url, audio_id)
        title = self._og_search_title(page)

        candidates = self._search(title, url, audio_id)['MusicData']
        matching = (
            candidate for candidate in candidates
            if candidate.get('File') == audio_id)

        info = self._extract_track(next(matching))
        # The page title overrides the title built from the track record.
        info['title'] = title
        return info
295
296
class MailRuMusicSearchIE(MailRuMusicSearchBaseIE):
    # Extractor turning a my.mail.ru/music/search/ URL into a playlist.
    IE_NAME = 'mailru:music:search'
    IE_DESC = 'Музыка@Mail.Ru'
    _VALID_URL = r'https?://my\.mail\.ru/+music/+search/+(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://my.mail.ru/music/search/black%20shadow',
        'info_dict': {
            'id': 'black shadow',
        },
        'playlist_mincount': 532,
    }]

    def _real_extract(self, url):
        """Collect every track returned for the search query in the URL.

        Result pages are requested LIMIT tracks at a time. Paging stops
        when a page has no ``MusicData`` list, or once the current offset
        exceeds the total reported by the response.
        """
        query = compat_urllib_parse_unquote(self._match_id(url))

        LIMIT = 100
        entries = []

        for offset in itertools.count(0, LIMIT):
            page = self._search(query, url, query, LIMIT, offset)

            music_data = page.get('MusicData')
            if not (music_data and isinstance(music_data, list)):
                break

            # Records lacking a URL or file id yield None and are dropped.
            extracted = (
                self._extract_track(record, fatal=False)
                for record in music_data)
            entries.extend(filter(None, extracted))

            total = try_get(
                page, lambda x: x['Results']['music']['Total'], int)
            if total is not None and offset > total:
                break

        return self.playlist_result(entries, query)