[yt-dlp.git] / youtube_dlc / extractor / aparat.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    merge_dicts,
    mimetype2ext,
    url_or_none,
)


class AparatIE(InfoExtractor):
    """Information extractor for videos hosted on aparat.com.

    Matches both the ``/v/<id>`` watch pages and the
    ``/video/video/embed/videohash/<id>`` embed URLs.  Formats and basic
    metadata are read from the JSON ``options`` blob the page passes to its
    player; JSON-LD found in the page supplies additional metadata.
    """

    _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'

    _TESTS = [{
        'url': 'http://www.aparat.com/v/wP8On',
        'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
        'info_dict': {
            'id': 'wP8On',
            'ext': 'mp4',
            'title': 'تیم گلکسی 11 - زومیت',
            'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028',
            'duration': 231,
            'timestamp': 1387394859,
            'upload_date': '20131218',
            'view_count': int,
        },
    }, {
        # multiple formats
        'url': 'https://www.aparat.com/v/8dflw/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the Aparat video at *url*.

        The watch page is tried first; when it cannot be downloaded the
        embed frame page is used instead.  Player options embedded in the
        page as ``options = JSON.parse('...')`` are decoded and the
        ``sabaPlayerPlugin`` entry supplies the sources, title and duration.

        Returns a dict with ``id``, ``title``, ``thumbnail``, ``duration``
        and ``formats``, merged on top of whatever JSON-LD metadata the
        page provides (JSON-LD values take precedence).

        Raises KeyError when the decoded options lack the
        ``plugins``/``sabaPlayerPlugin`` entries; errors raised by the
        InfoExtractor helpers (download, regex search, JSON parsing)
        propagate unchanged.
        """
        video_id = self._match_id(url)

        # The watch page provides more metadata, so it is preferred; a
        # failed download is tolerated because the embed frame below is
        # a usable fallback.
        webpage = self._download_webpage(url, video_id, fatal=False)

        if not webpage:
            # Note: There is an easier-to-parse configuration at
            # http://www.aparat.com/video/video/config/videohash/%video_id
            # but the URL in there does not work
            webpage = self._download_webpage(
                'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
                video_id)

        # The player options are embedded as a quoted JSON string passed to
        # JSON.parse(); capture everything between the matching quotes.
        options = self._parse_json(
            self._search_regex(
                r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
                webpage, 'options', group='value'),
            video_id)

        player = options['plugins']['sabaPlayerPlugin']

        formats = []
        # ``multiSRC`` is iterated as a list of source groups, each group
        # being a list of source dicts.  Tolerate a missing/null value and
        # non-list groups rather than crashing with KeyError/TypeError on
        # unexpected player data.
        for sources in player.get('multiSRC') or []:
            if not isinstance(sources, (list, tuple)):
                continue
            for item in sources:
                if not isinstance(item, dict):
                    continue
                file_url = url_or_none(item.get('src'))
                if not file_url:
                    continue
                item_type = item.get('type')
                if item_type == 'application/vnd.apple.mpegurl':
                    # HLS manifest: expand into its variant formats; a
                    # broken manifest must not discard the other sources.
                    formats.extend(self._extract_m3u8_formats(
                        file_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls',
                        fatal=False))
                else:
                    # Direct file: derive the extension from the MIME type
                    # and the height from a label such as "720p".
                    ext = mimetype2ext(item_type)
                    label = item.get('label')
                    formats.append({
                        'url': file_url,
                        'ext': ext,
                        'format_id': 'http-%s' % (label or ext),
                        'height': int_or_none(self._search_regex(
                            r'(\d+)[pP]', label or '', 'height',
                            default=None)),
                    })
        self._sort_formats(
            formats, field_preference=('height', 'width', 'tbr', 'format_id'))

        info = self._search_json_ld(webpage, video_id, default={})

        if not info.get('title'):
            # Prefer the player's own title; fall back to the page's
            # OpenGraph title instead of failing with a bare KeyError.
            info['title'] = player.get('title') or self._og_search_title(webpage)

        return merge_dicts(info, {
            'id': video_id,
            'thumbnail': url_or_none(options.get('poster')),
            'duration': int_or_none(player.get('duration')),
            'formats': formats,
        })
Commit	Line	Data
5f6a1245	1	# coding: utf-8
5fcf2dbe PH	2	from __future__ import unicode_literals
5fcf2dbe PH	3
aa94a6d3 PH	4	from .common import InfoExtractor
aa94a6d3 PH	5	from ..utils import (
70851a95	6	int_or_none,
2943397e	7	merge_dicts,
70851a95	8	mimetype2ext,
3052a30d	9	url_or_none,
aa94a6d3 PH	10	)
	11
	12
	13	class AparatIE(InfoExtractor):
70851a95	14	_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
aa94a6d3	15
2943397e	16	_TESTS = [{
5fcf2dbe	17	'url': 'http://www.aparat.com/v/wP8On',
b1c6f21c	18	'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
5fcf2dbe PH	19	'info_dict': {
	20	'id': 'wP8On',
	21	'ext': 'mp4',
	22	'title': 'تیم گلکسی 11 - زومیت',
2943397e S	23	'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028',
	24	'duration': 231,
	25	'timestamp': 1387394859,
	26	'upload_date': '20131218',
	27	'view_count': int,
aa94a6d3	28	},
2943397e S	29	}, {
	30	# multiple formats
	31	'url': 'https://www.aparat.com/v/8dflw/',
	32	'only_matching': True,
	33	}]
aa94a6d3 PH	34
aa94a6d3 PH	35	def _real_extract(self, url):
27e1400f	36	video_id = self._match_id(url)
aa94a6d3	37
2943397e S	38	# Provides more metadata
	39	webpage = self._download_webpage(url, video_id, fatal=False)
	40
	41	if not webpage:
	42	# Note: There is an easier-to-parse configuration at
	43	# http://www.aparat.com/video/video/config/videohash/%video_id
	44	# but the URL in there does not work
	45	webpage = self._download_webpage(
	46	'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
	47	video_id)
aa94a6d3	48
2943397e	49	options = self._parse_json(
70851a95	50	self._search_regex(
2943397e S	51	r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
2943397e S	52	webpage, 'options', group='value'),
70851a95 S	53	video_id)
70851a95 S	54
2943397e	55	player = options['plugins']['sabaPlayerPlugin']
9c4a83a1	56
70851a95	57	formats = []
2943397e S	58	for sources in player['multiSRC']:
	59	for item in sources:
	60	if not isinstance(item, dict):
	61	continue
9c4a83a1 AI	62	file_url = url_or_none(item.get('src'))
	63	if not file_url:
	64	continue
2943397e S	65	item_type = item.get('type')
	66	if item_type == 'application/vnd.apple.mpegurl':
	67	formats.extend(self._extract_m3u8_formats(
	68	file_url, video_id, 'mp4',
	69	entry_protocol='m3u8_native', m3u8_id='hls',
	70	fatal=False))
	71	else:
	72	ext = mimetype2ext(item.get('type'))
	73	label = item.get('label')
	74	formats.append({
	75	'url': file_url,
	76	'ext': ext,
	77	'format_id': 'http-%s' % (label or ext),
	78	'height': int_or_none(self._search_regex(
	79	r'(\d+)[pP]', label or '', 'height',
	80	default=None)),
	81	})
	82	self._sort_formats(
	83	formats, field_preference=('height', 'width', 'tbr', 'format_id'))
	84
	85	info = self._search_json_ld(webpage, video_id, default={})
70851a95	86
2943397e S	87	if not info.get('title'):
2943397e S	88	info['title'] = player['title']
aa94a6d3	89
2943397e	90	return merge_dicts(info, {
aa94a6d3	91	'id': video_id,
2943397e S	92	'thumbnail': url_or_none(options.get('poster')),
2943397e S	93	'duration': int_or_none(player.get('duration')),
70851a95	94	'formats': formats,
2943397e	95	})