[yt-dlp.git] / youtube_dl / extractor / onet.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    ExtractorError,
    float_or_none,
    get_element_by_class,
    int_or_none,
    js_to_json,
    parse_iso8601,
    remove_start,
    strip_or_none,
    url_basename,
)


class OnetBaseIE(InfoExtractor):
    """Shared helpers for the Onet extractors defined in this module."""

    def _search_mvp_id(self, webpage):
        """Return the id embedded in an ``id="mvp:<id>"`` attribute of *webpage*.

        The pattern accepts either quote style and captures everything
        between the ``mvp:`` prefix and the matching closing quote.
        """
        return self._search_regex(
            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')

    def _extract_from_id(self, video_id, webpage=None):
        """Build an info dict for *video_id* from the Onet ``get_asset_detail`` API.

        Parameters:
            video_id: id sent to the API both as the request id and as
                ``ID_Publikacji``; also returned as the ``id`` field.
            webpage: optional HTML of the page the video was found on.  When
                given, its OpenGraph title/description take precedence over
                the values from the API ``meta`` object.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``duration``, ``timestamp`` and ``formats``.

        Raises ExtractorError when the API answers with an ``error`` object
        or when the response carries no video entry under ``result['0']``.
        """
        response = self._download_json(
            'http://qi.ckm.onetapi.pl/', video_id,
            query={
                'body[id]': video_id,
                'body[jsonrpc]': '2.0',
                'body[method]': 'get_asset_detail',
                'body[params][ID_Publikacji]': video_id,
                'body[params][Service]': 'www.onet.pl',
                'content-type': 'application/jsonp',
                'x-onet-app': 'player.front.onetapi.pl',
            })

        error = response.get('error')
        if error:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)

        # Fail with a readable message instead of a TypeError/KeyError when
        # the response has neither an error nor the expected video entry.
        result = response.get('result')
        video = result.get('0') if isinstance(result, dict) else None
        if not isinstance(video, dict):
            raise ExtractorError(
                '%s API response contains no video data' % self.IE_NAME)

        formats_by_type = video.get('formats')
        if not isinstance(formats_by_type, dict):
            formats_by_type = {}

        formats = []
        for formats_dict in formats_by_type.values():
            if not isinstance(formats_dict, dict):
                continue
            for format_id, format_list in formats_dict.items():
                # TODO: Support Microsoft Smooth Streaming ('ism')
                if format_id == 'ism' or not isinstance(format_list, list):
                    continue
                for f in format_list:
                    if not isinstance(f, dict):
                        continue
                    video_url = f.get('url')
                    if not video_url:
                        continue
                    if determine_ext(video_url) == 'mpd':
                        # TODO: Current DASH formats are broken - $Time$
                        # pattern in <SegmentTemplate> not implemented yet,
                        # so _extract_mpd_formats is not called here.
                        continue
                    formats.append({
                        'url': video_url,
                        'format_id': format_id,
                        'height': int_or_none(f.get('vertical_resolution')),
                        'width': int_or_none(f.get('horizontal_resolution')),
                        'abr': float_or_none(f.get('audio_bitrate')),
                        'vbr': float_or_none(f.get('video_bitrate')),
                    })
        self._sort_formats(formats)

        # 'meta' may be absent or null; treat both as "no metadata".
        meta = video.get('meta') or {}

        og_title = og_description = None
        if webpage:
            og_title = self._og_search_title(webpage, default=None)
            og_description = self._og_search_description(webpage, default=None)

        title = og_title or meta['title']
        description = og_description or meta.get('description')
        # 'lenght' is looked up as a fallback key - presumably a misspelling
        # that the API emits for some assets (verify against live responses).
        duration = meta.get('length') or meta.get('lenght')
        # ' ' is passed as the delimiter argument, i.e. 'addDate' is expected
        # to separate date and time with a space.
        timestamp = parse_iso8601(meta.get('addDate'), ' ')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
        }


class OnetIE(OnetBaseIE):
    """Extractor for single onet.tv video pages.

    URLs have the shape ``onet.tv/<letter>/<channel>/<display_id>/<id>``.
    """

    # Raw string: the pattern contains '\.' which is an invalid escape
    # sequence in a normal string literal.
    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
    IE_NAME = 'onet.tv'

    _TEST = {
        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
        'md5': 'e3ffbf47590032ac3f27249204173d50',
        'info_dict': {
            'id': 'qbpyqc',
            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
            'ext': 'mp4',
            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
            'upload_date': '20160705',
            'timestamp': 1467721580,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video page at *url*.

        The page is downloaded to find the ``mvp:`` id, which is what the
        API lookup uses; the returned ``id`` and ``display_id`` are the ones
        taken from the URL itself.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id, video_id = mobj.group('display_id', 'id')

        webpage = self._download_webpage(url, display_id)

        mvp_id = self._search_mvp_id(webpage)

        info_dict = self._extract_from_id(mvp_id, webpage)
        # _extract_from_id reports the mvp id; expose the URL id instead.
        info_dict.update({
            'id': video_id,
            'display_id': display_id,
        })

        return info_dict


class OnetChannelIE(OnetBaseIE):
    """Extractor for onet.tv channel pages (``onet.tv/<letter>/<channel>``).

    Yields either the clip currently featured on the channel page or a
    playlist of every video link found on that page.
    """

    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
    IE_NAME = 'onet.tv:channel'

    _TEST = {
        'url': 'http://onet.tv/k/openerfestival',
        'info_dict': {
            'id': 'openerfestival',
            'title': 'Open\'er Festival Live',
            'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
        },
        'playlist_mincount': 46,
    }

    def _real_extract(self, url):
        """Return a single-video info dict or a playlist result for *url*.

        With the ``noplaylist`` downloader parameter set, only the clip
        described by the page's ``currentClip`` JavaScript object is
        extracted; otherwise every matching video link becomes an entry.
        """
        channel_id = self._match_id(url)
        page = self._download_webpage(url, channel_id)

        def clip_source_to_json(source):
            # Drop "' + '" joins between adjacent string literals so the
            # object literal can be converted by js_to_json.
            return js_to_json(re.sub(r'\'\s*\+\s*\'', '', source))

        clip_source = self._search_regex(
            r'var\s+currentClip\s*=\s*({[^}]+})', page, 'video info')
        clip = self._parse_json(
            clip_source, channel_id, transform_source=clip_source_to_json)

        clip_id = remove_start(clip['ckmId'], 'mvp:')
        clip_name = url_basename(clip['url'])

        # Single-video mode: hand the featured clip to the shared extractor.
        if self._downloader.params.get('noplaylist'):
            self.to_screen(
                'Downloading just video %s because of --no-playlist' % clip_name)
            return self._extract_from_id(clip_id, page)

        self.to_screen(
            'Downloading channel %s - add --no-playlist to just download video %s' % (
                channel_id, clip_name))

        # Each anchor pointing at a video page is delegated to OnetIE.
        entries = []
        for link in re.findall(
                r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
                page):
            entries.append(self.url_result(link, OnetIE.ie_key()))

        title = strip_or_none(get_element_by_class('o_channelName', page))
        description = strip_or_none(get_element_by_class('o_channelDesc', page))
        return self.playlist_result(entries, channel_id, title, description)
Commit	Line	Data
f8752b86 YCH	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import re
	5
	6	from .common import InfoExtractor
	7	from ..utils import (
	8	determine_ext,
	9	ExtractorError,
	10	float_or_none,
	11	get_element_by_class,
	12	int_or_none,
	13	js_to_json,
	14	parse_iso8601,
	15	remove_start,
	16	strip_or_none,
	17	url_basename,
	18	)
	19
	20
	21	class OnetBaseIE(InfoExtractor):
	22	def _search_mvp_id(self, webpage):
	23	return self._search_regex(
	24	r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
	25
	26	def _extract_from_id(self, video_id, webpage):
	27	response = self._download_json(
	28	'http://qi.ckm.onetapi.pl/', video_id,
	29	query={
	30	'body[id]': video_id,
	31	'body[jsonrpc]': '2.0',
	32	'body[method]': 'get_asset_detail',
	33	'body[params][ID_Publikacji]': video_id,
	34	'body[params][Service]': 'www.onet.pl',
	35	'content-type': 'application/jsonp',
	36	'x-onet-app': 'player.front.onetapi.pl',
	37	})
	38
	39	error = response.get('error')
	40	if error:
	41	raise ExtractorError(
	42	'%s said: %s' % (self.IE_NAME, error['message']), expected=True)
	43
	44	video = response['result'].get('0')
	45
	46	formats = []
	47	for _, formats_dict in video['formats'].items():
	48	if not isinstance(formats_dict, dict):
	49	continue
	50	for format_id, format_list in formats_dict.items():
	51	if not isinstance(format_list, list):
	52	continue
	53	for f in format_list:
	54	video_url = f.get('url')
	55	if not video_url:
	56	continue
	57	ext = determine_ext(video_url)
	58	if format_id == 'ism':
	59	# TODO: Support Microsoft Smooth Streaming
	60	continue
	61	elif ext == 'mpd':
	62	# TODO: Current DASH formats are broken - $Time$ pattern in
	63	# <SegmentTemplate> not implemented yet
	64	# formats.extend(self._extract_mpd_formats(
65	# video_url, video_id, mpd_id='dash', fatal=False))
66	continue
67	else:
68	formats.append({
69	'url': video_url,
70	'format_id': format_id,
71	'height': int_or_none(f.get('vertical_resolution')),
72	'width': int_or_none(f.get('horizontal_resolution')),
73	'abr': float_or_none(f.get('audio_bitrate')),
74	'vbr': float_or_none(f.get('video_bitrate')),
75	})
76	self._sort_formats(formats)
77
78	meta = video.get('meta', {})
79
80	title = self._og_search_title(webpage, default=None) or meta['title']
81	description = self._og_search_description(webpage, default=None) or meta.get('description')
82	duration = meta.get('length') or meta.get('lenght')
83	timestamp = parse_iso8601(meta.get('addDate'), ' ')
84
85	return {
86	'id': video_id,
87	'title': title,
88	'description': description,
89	'duration': duration,
90	'timestamp': timestamp,
91	'formats': formats,
92	}
93
94
95	class OnetIE(OnetBaseIE):
96	_VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
97	IE_NAME = 'onet.tv'
98
99	_TEST = {
100	'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
43f1e4e4	101	'md5': 'e3ffbf47590032ac3f27249204173d50',
f8752b86 YCH	102	'info_dict': {
	103	'id': 'qbpyqc',
	104	'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
	105	'ext': 'mp4',
	106	'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
	107	'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
	108	'upload_date': '20160705',
	109	'timestamp': 1467721580,
	110	},
	111	}
	112
	113	def _real_extract(self, url):
	114	mobj = re.match(self._VALID_URL, url)
	115	display_id, video_id = mobj.group('display_id', 'id')
	116
	117	webpage = self._download_webpage(url, display_id)
	118
	119	mvp_id = self._search_mvp_id(webpage)
	120
	121	info_dict = self._extract_from_id(mvp_id, webpage)
	122	info_dict.update({
	123	'id': video_id,
	124	'display_id': display_id,
	125	})
	126
	127	return info_dict
	128
	129
	130	class OnetChannelIE(OnetBaseIE):
	131	_VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]\|$)'
	132	IE_NAME = 'onet.tv:channel'
	133
	134	_TEST = {
	135	'url': 'http://onet.tv/k/openerfestival',
	136	'info_dict': {
	137	'id': 'openerfestival',
	138	'title': 'Open\'er Festival Live',
	139	'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
	140	},
	141	'playlist_mincount': 46,
	142	}
	143
	144	def _real_extract(self, url):
	145	channel_id = self._match_id(url)
	146
	147	webpage = self._download_webpage(url, channel_id)
	148
	149	current_clip_info = self._parse_json(self._search_regex(
	150	r'var\s+currentClip\s=\s({[^}]+})', webpage, 'video info'), channel_id,
	151	transform_source=lambda s: js_to_json(re.sub(r'\'\s\+\s\'', '', s)))
	152	video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
	153	video_name = url_basename(current_clip_info['url'])
	154
	155	if self._downloader.params.get('noplaylist'):
	156	self.to_screen(
	157	'Downloading just video %s because of --no-playlist' % video_name)
	158	return self._extract_from_id(video_id, webpage)
	159
	160	self.to_screen(
	161	'Downloading channel %s - add --no-playlist to just download video %s' % (
	162	channel_id, video_name))
	163	matches = re.findall(
	164	r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
	165	webpage)
166	entries = [
167	self.url_result(video_link, OnetIE.ie_key())
168	for video_link in matches]
169
170	channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
171	channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
172	return self.playlist_result(entries, channel_id, channel_title, channel_description)