[yt-dlp.git] / youtube_dl / extractor / mailru.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    remove_end,
)


class MailRuIE(InfoExtractor):
    """Extractor for videos hosted on my.mail.ru.

    Two URL shapes are accepted by ``_VALID_URL``:

    * ``.../video/<anything>#video=/<a>/<b>/<c>/<digits>`` -- the part after
      ``#video=`` is captured whole as ``idv1``;
    * ``.../<a>/<b>/video/<c>/<digits>.html`` -- captured as ``idv2prefix``
      (``<a>/<b>/``) and ``idv2suffix`` (``<c>/<digits>``), which are
      concatenated to form the same kind of path-like id.
    """

    IE_NAME = 'mailru'
    IE_DESC = 'Видео@Mail.Ru'
    _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'

    _TESTS = [
        {
            'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
            'md5': 'dea205f03120046894db4ebb6159879a',
            'info_dict': {
                'id': '46301138_76',
                'ext': 'mp4',
                'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
                'timestamp': 1393232740,
                'upload_date': '20140224',
                'uploader': 'sonypicturesrus',
                'uploader_id': 'sonypicturesrus@mail.ru',
                'duration': 184,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
            'md5': '00a91a58c3402204dcced523777b475f',
            'info_dict': {
                'id': '46843144_1263',
                'ext': 'mp4',
                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
                'timestamp': 1397039888,
                'upload_date': '20140409',
                'uploader': 'hitech@corp.mail.ru',
                'uploader_id': 'hitech@corp.mail.ru',
                'duration': 245,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            # only available via metaUrl API
            'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html',
            'md5': '3b26d2491c6949d031a32b96bd97c096',
            'info_dict': {
                'id': '56664382_502',
                'ext': 'mp4',
                'title': ':8336',
                'timestamp': 1449094163,
                'upload_date': '20151202',
                'uploader': '720pizle@mail.ru',
                'uploader_id': '720pizle@mail.ru',
                'duration': 6001,
            },
            'skip': 'Not accessible from Travis CI server',
        },
        {
            'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Build the info dict for the video addressed by ``url``.

        The video metadata JSON is taken from the ``metaUrl`` advertised in
        the page's ``sp-video__page-config`` script when available, otherwise
        from the older ``api.video.mail.ru`` endpoint.

        ``videos``, ``meta`` and ``meta.title`` are treated as mandatory in
        that JSON (a missing key raises ``KeyError``); every other field is
        optional and ends up as ``None`` when absent.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('idv1')

        if not video_id:
            # Second URL shape: stitch the id together from its two halves.
            video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')

        webpage = self._download_webpage(url, video_id)

        video_data = None

        # The page config is optional: default to an empty JSON object when
        # the script tag is absent and do not abort on unparsable JSON.
        page_config = self._parse_json(self._search_regex(
            r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
            webpage, 'page config', default='{}'), video_id, fatal=False)
        if page_config:
            # ``or {}`` (rather than a .get() default) also covers the case
            # where the "video" key is present but null.
            meta_url = (
                page_config.get('metaUrl')
                or (page_config.get('video') or {}).get('metaUrl'))
            if meta_url:
                video_data = self._download_json(
                    meta_url, video_id, 'Downloading video meta JSON', fatal=False)

        # Fallback old approach
        if not video_data:
            video_data = self._download_json(
                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
                video_id, 'Downloading video JSON')

        formats = []
        for f in video_data['videos']:
            video_url = f.get('url')
            if not video_url:
                continue
            format_id = f.get('key')
            # Keys shaped like "720p" carry the frame height; anything else
            # leaves the height unknown.
            height = int_or_none(self._search_regex(
                r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'height': height,
            })
        self._sort_formats(formats)

        meta_data = video_data['meta']
        title = remove_end(meta_data['title'], '.mp4')

        # Uploader details are optional metadata: a missing or null "author"
        # must not abort the extraction.
        author = video_data.get('author') or {}
        uploader = author.get('name')
        uploader_id = author.get('id') or author.get('email')
        view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count'))

        # Prefer the "<accId>_<itemId>" form when both parts are reported;
        # otherwise keep the path-like id derived from the URL.
        acc_id = meta_data.get('accId')
        item_id = meta_data.get('itemId')
        content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id

        thumbnail = meta_data.get('poster')
        duration = int_or_none(meta_data.get('duration'))
        timestamp = int_or_none(meta_data.get('timestamp'))

        return {
            'id': content_id,
            'title': title,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
        }
Commit	Line	Data
dcdb292f	1	# coding: utf-8
69bb54eb S	2	from __future__ import unicode_literals
	3
	4	import re
69bb54eb S	5
69bb54eb S	6	from .common import InfoExtractor
b081350b S	7	from ..utils import (
	8	int_or_none,
	9	remove_end,
	10	)
69bb54eb S	11
	12
	13	class MailRuIE(InfoExtractor):
	14	IE_NAME = 'mailru'
	15	IE_DESC = 'Видео@Mail.Ru'
b5a5bbf3	16	_VALID_URL = r'https?://(?:(?:www\|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)\|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'
69bb54eb	17
ceb7a17f S	18	_TESTS = [
	19	{
	20	'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
	21	'md5': 'dea205f03120046894db4ebb6159879a',
	22	'info_dict': {
00d9ef0b	23	'id': '46301138_76',
ceb7a17f S	24	'ext': 'mp4',
	25	'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
	26	'timestamp': 1393232740,
	27	'upload_date': '20140224',
	28	'uploader': 'sonypicturesrus',
	29	'uploader_id': 'sonypicturesrus@mail.ru',
	30	'duration': 184,
	31	},
95e431e9	32	'skip': 'Not accessible from Travis CI server',
ceb7a17f S	33	},
	34	{
	35	'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
	36	'md5': '00a91a58c3402204dcced523777b475f',
	37	'info_dict': {
00d9ef0b	38	'id': '46843144_1263',
ceb7a17f S	39	'ext': 'mp4',
ceb7a17f S	40	'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
3967a761 S	41	'timestamp': 1397039888,
	42	'upload_date': '20140409',
	43	'uploader': 'hitech@corp.mail.ru',
ceb7a17f S	44	'uploader_id': 'hitech@corp.mail.ru',
	45	'duration': 245,
	46	},
95e431e9	47	'skip': 'Not accessible from Travis CI server',
ceb7a17f	48	},
16f1430b S	49	{
	50	# only available via metaUrl API
	51	'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html',
	52	'md5': '3b26d2491c6949d031a32b96bd97c096',
	53	'info_dict': {
	54	'id': '56664382_502',
	55	'ext': 'mp4',
	56	'title': ':8336',
	57	'timestamp': 1449094163,
	58	'upload_date': '20151202',
	59	'uploader': '720pizle@mail.ru',
	60	'uploader_id': '720pizle@mail.ru',
	61	'duration': 6001,
	62	},
	63	'skip': 'Not accessible from Travis CI server',
b5a5bbf3 S	64	},
	65	{
	66	'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',
	67	'only_matching': True,
16f1430b	68	}
ceb7a17f	69	]
69bb54eb S	70
	71	def _real_extract(self, url):
	72	mobj = re.match(self._VALID_URL, url)
ceb7a17f S	73	video_id = mobj.group('idv1')
	74
	75	if not video_id:
	76	video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
69bb54eb	77
16f1430b S	78	webpage = self._download_webpage(url, video_id)
	79
	80	video_data = None
	81
	82	page_config = self._parse_json(self._search_regex(
	83	r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
	84	webpage, 'page config', default='{}'), video_id, fatal=False)
	85	if page_config:
	86	meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl')
	87	if meta_url:
	88	video_data = self._download_json(
	89	meta_url, video_id, 'Downloading video meta JSON', fatal=False)
	90
	91	# Fallback old approach
	92	if not video_data:
	93	video_data = self._download_json(
	94	'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
	95	video_id, 'Downloading video JSON')
69bb54eb	96
b081350b S	97	formats = []
	98	for f in video_data['videos']:
	99	video_url = f.get('url')
	100	if not video_url:
	101	continue
	102	format_id = f.get('key')
	103	height = int_or_none(self._search_regex(
	104	r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
	105	formats.append({
	106	'url': video_url,
	107	'format_id': format_id,
	108	'height': height,
	109	})
	110	self._sort_formats(formats)
69bb54eb	111
00d9ef0b	112	meta_data = video_data['meta']
b081350b S	113	title = remove_end(meta_data['title'], '.mp4')
	114
	115	author = video_data.get('author')
	116	uploader = author.get('name')
	117	uploader_id = author.get('id') or author.get('email')
	118	view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count'))
	119
	120	acc_id = meta_data.get('accId')
	121	item_id = meta_data.get('itemId')
	122	content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id
	123
	124	thumbnail = meta_data.get('poster')
	125	duration = int_or_none(meta_data.get('duration'))
	126	timestamp = int_or_none(meta_data.get('timestamp'))
69bb54eb S	127
	128	return {
	129	'id': content_id,
	130	'title': title,
	131	'thumbnail': thumbnail,
00d9ef0b	132	'timestamp': timestamp,
69bb54eb S	133	'uploader': uploader,
	134	'uploader_id': uploader_id,
	135	'duration': duration,
	136	'view_count': view_count,
	137	'formats': formats,
00d9ef0b	138	}