[yt-dlp.git] / yt_dlp / extractor / zapiks.py

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    parse_iso8601,
    xpath_with_ns,
    xpath_text,
    int_or_none,
)


class ZapiksIE(InfoExtractor):
    """Extractor for videos on zapiks.fr / zapiks.com.

    Two URL shapes are accepted (see ``_VALID_URL``): ``<slug>.html`` pages,
    optionally under a two-letter directory, and ``index.php?...media_id=<n>``
    URLs that carry the numeric media id directly.
    """

    _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))'
    _TESTS = [
        {
            'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
            'md5': 'aeb3c473b2d564b2d46d664d28d5f050',
            'info_dict': {
                'id': '80798',
                'ext': 'mp4',
                'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!',
                'description': 'md5:7054d6f6f620c6519be1fe710d4da847',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 528,
                'timestamp': 1359044972,
                'upload_date': '20130124',
                'view_count': int,
            },
        },
        {
            'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html',
            'only_matching': True,
        },
        {
            'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html',
            'only_matching': True,
        },
        {
            'url': 'http://www.zapiks.fr/index.php?action=playerIframe&amp;media_id=118046&amp;width=640&amp;height=360&amp;autoStart=false&amp;language=fr',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for a single Zapiks video.

        The webpage is downloaded for its metadata (and for the numeric media
        id when the URL does not contain one); the playlist XML for that id
        supplies the title, thumbnail and the media sources.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``thumbnail``, ``duration``, ``timestamp``, ``view_count``,
        ``comment_count`` and ``formats``.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id') or video_id

        webpage = self._download_webpage(url, display_id)

        if not video_id:
            # ``.html`` URLs only match a slug, so the numeric id has to be
            # read from the page markup.
            video_id = self._search_regex(
                r'data-media-id="(\d+)"', webpage, 'video id')

        playlist = self._download_xml(
            'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id,
            display_id)

        NS_MAP = {
            'jwplayer': 'http://rss.jwpcdn.com/'
        }

        def ns(path):
            return xpath_with_ns(path, NS_MAP)

        # ``find`` returns None when the playlist has no <item>; every use of
        # ``item`` below is guarded so that case falls back to webpage
        # metadata instead of failing with an AttributeError.
        item = playlist.find('./channel/item')
        has_item = item is not None

        title = (
            xpath_text(item, 'title', 'title') if has_item else None
        ) or self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        thumbnail = (
            xpath_text(item, ns('./jwplayer:image'), 'thumbnail') if has_item else None
        ) or self._og_search_thumbnail(webpage, default=None)
        duration = parse_duration(self._html_search_meta(
            'duration', webpage, 'duration', default=None))
        timestamp = parse_iso8601(self._html_search_meta(
            'uploadDate', webpage, 'upload date', default=None), ' ')

        view_count = int_or_none(self._search_regex(
            r'UserPlays:(\d+)', webpage, 'view count', default=None))
        comment_count = int_or_none(self._search_regex(
            r'UserComments:(\d+)', webpage, 'comment count', default=None))

        formats = []
        sources = item.findall(ns('./jwplayer:source')) if has_item else []
        for source in sources:
            # A source without a ``file`` attribute has nothing to download;
            # skip it rather than aborting the whole extraction.
            source_url = source.get('file')
            if not source_url:
                continue
            # ``label`` may be absent; the format is still kept, just without
            # a format id or a height derived from it.
            format_id = source.get('label')
            f = {
                'url': source_url,
                'format_id': format_id,
            }
            # Labels that start with digits followed by "p"/"P" (e.g. "720p")
            # give the vertical resolution.
            m = re.search(r'^(?P<height>\d+)[pP]', format_id or '')
            if m:
                f['height'] = int(m.group('height'))
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': timestamp,
            'view_count': view_count,
            'comment_count': comment_count,
            'formats': formats,
        }
Commit	Line	Data
4aeccadf S	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	parse_duration,
	6	parse_iso8601,
	7	xpath_with_ns,
	8	xpath_text,
	9	int_or_none,
	10	)
	11
	12
	13	class ZapiksIE(InfoExtractor):
ea5152ca	14	_VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr\|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html\|index\.php\?.*\bmedia_id=(?P<id>\d+))'
4aeccadf S	15	_TESTS = [
	16	{
	17	'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
	18	'md5': 'aeb3c473b2d564b2d46d664d28d5f050',
	19	'info_dict': {
	20	'id': '80798',
	21	'ext': 'mp4',
	22	'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!',
	23	'description': 'md5:7054d6f6f620c6519be1fe710d4da847',
ec85ded8	24	'thumbnail': r're:^https?://.*\.jpg$',
4aeccadf S	25	'duration': 528,
	26	'timestamp': 1359044972,
	27	'upload_date': '20130124',
	28	'view_count': int,
4aeccadf S	29	},
4aeccadf S	30	},
ea5152ca S	31	{
	32	'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html',
	33	'only_matching': True,
	34	},
	35	{
	36	'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html',
	37	'only_matching': True,
	38	},
4aeccadf S	39	{
	40	'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr',
	41	'only_matching': True,
	42	},
	43	]
	44
	45	def _real_extract(self, url):
5ad28e7f	46	mobj = self._match_valid_url(url)
4aeccadf S	47	video_id = mobj.group('id')
	48	display_id = mobj.group('display_id') or video_id
	49
	50	webpage = self._download_webpage(url, display_id)
	51
	52	if not video_id:
	53	video_id = self._search_regex(
	54	r'data-media-id="(\d+)"', webpage, 'video id')
	55
	56	playlist = self._download_xml(
	57	'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id,
	58	display_id)
	59
	60	NS_MAP = {
	61	'jwplayer': 'http://rss.jwpcdn.com/'
	62	}
	63
	64	def ns(path):
	65	return xpath_with_ns(path, NS_MAP)
	66
	67	item = playlist.find('./channel/item')
	68
	69	title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage)
	70	description = self._og_search_description(webpage, default=None)
	71	thumbnail = xpath_text(
	72	item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None)
	73	duration = parse_duration(self._html_search_meta(
	74	'duration', webpage, 'duration', default=None))
	75	timestamp = parse_iso8601(self._html_search_meta(
	76	'uploadDate', webpage, 'upload date', default=None), ' ')
	77
	78	view_count = int_or_none(self._search_regex(
	79	r'UserPlays:(\d+)', webpage, 'view count', default=None))
	80	comment_count = int_or_none(self._search_regex(
	81	r'UserComments:(\d+)', webpage, 'comment count', default=None))
	82
	83	formats = []
	84	for source in item.findall(ns('./jwplayer:source')):
	85	format_id = source.attrib['label']
	86	f = {
	87	'url': source.attrib['file'],
	88	'format_id': format_id,
	89	}
	90	m = re.search(r'^(?P<height>\d+)[pP]', format_id)
	91	if m:
	92	f['height'] = int(m.group('height'))
	93	formats.append(f)
	94	self._sort_formats(formats)
	95
	96	return {
	97	'id': video_id,
	98	'title': title,
	99	'description': description,
	100	'thumbnail': thumbnail,
	101	'duration': duration,
	102	'timestamp': timestamp,
	103	'view_count': view_count,
	104	'comment_count': comment_count,
	105	'formats': formats,
	106	}