[yt-dlp.git] / youtube_dl / extractor / viki.py

from __future__ import unicode_literals

import re

from ..compat import (
    compat_urlparse,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    unescapeHTML,
    unified_strdate,
    US_RATINGS,
    determine_ext,
    mimetype2ext,
)
from .common import InfoExtractor


class VikiIE(InfoExtractor):
    """Extractor for single viki.com videos (``/videos/<id>v`` URLs).

    Title, description, thumbnail, uploader and rating are scraped from the
    regular video page. The media URL, upload date and subtitle tracks are
    taken from the ``player5_fragment`` page, which is requested with the
    user agent stored in ``_USER_AGENT``.
    """
    IE_NAME = 'viki'

    # iPad2
    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'

    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
    _TESTS = [{
        'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
        'info_dict': {
            'id': '1023585v',
            'ext': 'mp4',
            'title': 'Heirs Episode 14',
            'uploader': 'SBS',
            'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
            'upload_date': '20131121',
            'age_limit': 13,
        },
        'skip': 'Blocked in the US',
    }, {
        'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
        'info_dict': {
            'id': '1067139v',
            'ext': 'mp4',
            'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
            'upload_date': '20150430',
            'title': '\'The Avengers: Age of Ultron\' Press Conference',
        }
    }, {
        'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
        'info_dict': {
            'id': '1048879v',
            'ext': 'mp4',
            'upload_date': '20140820',
            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
            'title': 'Ankhon Dekhi',
        },
        'params': {
            # requires ffmpeg
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Build the info dict for the video at *url*.

        Raises ExtractorError when the player page carries a
        ``video-error`` block (region blocks are reported as expected
        errors) or when no ``<source>`` tag can be found in it.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        # The broadcast network is optional; leave uploader unset if absent
        uploader_m = re.search(
            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
        uploader = uploader_m.group(1).strip() if uploader_m else None

        # An unknown or missing rating simply yields age_limit = None
        rating_str = self._html_search_regex(
            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
            'rating information', default='').strip()
        age_limit = US_RATINGS.get(rating_str)

        # The player fragment is fetched with the iPad user agent
        req = compat_urllib_request.Request(
            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
        req.add_header('User-Agent', self._USER_AGENT)
        info_webpage = self._download_webpage(
            req, video_id, note='Downloading info page')

        err_msg = self._html_search_regex(
            r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage,
            'error message', default=None)
        if err_msg:
            if 'not available in your region' in err_msg:
                raise ExtractorError(
                    'Video %s is blocked from your location.' % video_id,
                    expected=True)
            else:
                raise ExtractorError('Viki said: %s %s' % (err_msg, url))

        mobj = re.search(
            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
        if not mobj:
            raise ExtractorError('Unable to find video URL')
        video_url = unescapeHTML(mobj.group('url'))
        video_ext = mimetype2ext(mobj.group('mime_type'))

        # HLS manifests are expanded into their variants; anything else is
        # treated as a single direct media URL
        if determine_ext(video_url) == 'm3u8':
            formats = self._extract_m3u8_formats(
                video_url, video_id, ext=video_ext)
        else:
            formats = [{
                'url': video_url,
                'ext': video_ext,
            }]

        # The upload date is optional metadata: default=None keeps a missing
        # "created_at" field from aborting the whole extraction and makes
        # the None fallback below reachable
        upload_date_str = self._html_search_regex(
            r'"created_at":"([^"]+)"', info_webpage, 'upload date',
            default=None)
        upload_date = (
            unified_strdate(upload_date_str)
            if upload_date_str is not None
            else None
        )

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, info_webpage)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
            'age_limit': age_limit,
            'uploader': uploader,
            'subtitles': video_subtitles,
            'upload_date': upload_date,
        }

    def _get_subtitles(self, video_id, info_webpage):
        """Collect the VTT tracks referenced by ``<track>`` tags.

        Returns a dict mapping the language code, taken from the
        ``/<lang>.vtt`` part of each track URL, to a one-element list with
        the subtitle's absolute URL and extension. Tracks whose URL does not
        match that pattern are skipped.
        """
        res = {}
        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
            sturl = unescapeHTML(sturl_html)
            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
            if not m:
                continue
            res[m.group('lang')] = [{
                # Track URLs may be relative to the site root
                'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
                'ext': 'vtt',
            }]
        return res


class VikiChannelIE(InfoExtractor):
    """Playlist extractor for viki.com shows (``/tv/<id>c`` URLs).

    Title and description come from the show page. The entries are gathered
    from the paginated ``api.viki.io`` container listings for episodes and
    clips, each entry being delegated to the single-video extractor.
    """
    IE_NAME = 'viki:channel'
    _VALID_URL = r'^https?://(?:www\.)?viki\.com/tv/(?P<id>[0-9]+c)'
    _TESTS = [{
        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
        'info_dict': {
            'id': '50c',
            'title': 'Boys Over Flowers',
            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
        },
        'playlist_count': 70,
    }, {
        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
        'info_dict': {
            'id': '1354c',
            'title': 'Poor Nastya [COMPLETE]',
            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
        },
        'playlist_count': 127,
    }]
    # Value sent as the per_page query parameter of every listing request
    _PER_PAGE = 25

    def _real_extract(self, url):
        """Return a playlist with every episode and clip of the show."""
        show_id = self._match_id(url)
        show_page = self._download_webpage(url, show_id, 'Download show page')

        title = self._og_search_title(show_page)
        description = self._og_search_description(show_page)

        entries = []
        for video_type in ['episodes', 'clips']:
            json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type, self._PER_PAGE)
            # Follow the "next" links until the API stops providing one;
            # an empty link is treated the same as a missing one
            while json_url:
                # The page number only decorates the progress note, so a
                # "next" URL without a page parameter must not abort
                note = 'Downloading %s json page' % video_type
                page_m = re.search(r'[?&]page=([0-9]+)', json_url)
                if page_m:
                    note += ' #%s' % page_m.group(1)
                show_json = self._download_json(json_url, show_id, note=note)
                for video in show_json['response']:
                    video_id = video['id']
                    entries.append(self.url_result(
                        'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id))
                json_url = show_json['pagination']['next']

        return self.playlist_result(entries, show_id, title, description)
Commit	Line	Data
cb9722cb PH	1	from __future__ import unicode_literals
cb9722cb PH	2
382ed50e PH	3	import re
382ed50e PH	4
8e3df9df YCH	5	from ..compat import (
	6	compat_urlparse,
	7	compat_urllib_request,
	8	)
382ed50e	9	from ..utils import (
6d88bc37	10	ExtractorError,
de79c46c	11	unescapeHTML,
382ed50e	12	unified_strdate,
a1a530b0	13	US_RATINGS,
d948e09b YCH	14	determine_ext,
d948e09b YCH	15	mimetype2ext,
382ed50e	16	)
4f7cea6c	17	from .common import InfoExtractor
382ed50e PH	18
382ed50e PH	19
4f7cea6c	20	class VikiIE(InfoExtractor):
cb9722cb	21	IE_NAME = 'viki'
382ed50e	22
8e3df9df YCH	23	# iPad2
	24	_USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
	25
382ed50e	26	_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
8e3df9df	27	_TESTS = [{
cb9722cb	28	'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
cb9722cb PH	29	'info_dict': {
	30	'id': '1023585v',
	31	'ext': 'mp4',
	32	'title': 'Heirs Episode 14',
	33	'uploader': 'SBS',
	34	'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
	35	'upload_date': '20131121',
	36	'age_limit': 13,
6d88bc37	37	},
cb9722cb	38	'skip': 'Blocked in the US',
8e3df9df YCH	39	}, {
	40	'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
	41	'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
	42	'info_dict': {
	43	'id': '1067139v',
	44	'ext': 'mp4',
	45	'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
	46	'upload_date': '20150430',
	47	'title': '\'The Avengers: Age of Ultron\' Press Conference',
	48	}
d948e09b YCH	49	}, {
	50	'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
	51	'info_dict': {
	52	'id': '1048879v',
	53	'ext': 'mp4',
	54	'upload_date': '20140820',
	55	'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
	56	'title': 'Ankhon Dekhi',
	57	},
	58	'params': {
	59	# requires ffmpeg
	60	'skip_download': True,
	61	}
8e3df9df	62	}]
382ed50e PH	63
382ed50e PH	64	def _real_extract(self, url):
8ee34150	65	video_id = self._match_id(url)
382ed50e PH	66
	67	webpage = self._download_webpage(url, video_id)
	68	title = self._og_search_title(webpage)
	69	description = self._og_search_description(webpage)
	70	thumbnail = self._og_search_thumbnail(webpage)
	71
1fb2bcbb PH	72	uploader_m = re.search(
	73	r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
	74	if uploader_m is None:
	75	uploader = None
	76	else:
07e40358	77	uploader = uploader_m.group(1).strip()
382ed50e PH	78
	79	rating_str = self._html_search_regex(
	80	r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
cb9722cb	81	'rating information', default='').strip()
a1a530b0	82	age_limit = US_RATINGS.get(rating_str)
382ed50e	83
8e3df9df YCH	84	req = compat_urllib_request.Request(
	85	'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
	86	req.add_header('User-Agent', self._USER_AGENT)
b7553b25	87	info_webpage = self._download_webpage(
8e3df9df	88	req, video_id, note='Downloading info page')
89966a5a YCH	89	err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
89966a5a YCH	90	if err_msg:
89966a5a YCH	91	if 'not available in your region' in err_msg:
	92	raise ExtractorError(
	93	'Video %s is blocked from your location.' % video_id,
	94	expected=True)
	95	else:
1c18de00	96	raise ExtractorError('Viki said: %s %s' % (err_msg, url))
d948e09b YCH	97	mobj = re.search(
	98	r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
	99	if not mobj:
	100	raise ExtractorError('Unable to find video URL')
	101	video_url = unescapeHTML(mobj.group('url'))
	102	video_ext = mimetype2ext(mobj.group('mime_type'))
	103
	104	if determine_ext(video_url) == 'm3u8':
	105	formats = self._extract_m3u8_formats(
	106	video_url, video_id, ext=video_ext)
	107	else:
	108	formats = [{
	109	'url': video_url,
	110	'ext': video_ext,
	111	}]
382ed50e PH	112
382ed50e PH	113	upload_date_str = self._html_search_regex(
cb9722cb	114	r'"created_at":"([^"]+)"', info_webpage, 'upload date')
382ed50e PH	115	upload_date = (
	116	unified_strdate(upload_date_str)
	117	if upload_date_str is not None
	118	else None
	119	)
	120
	121	# subtitles
	122	video_subtitles = self.extract_subtitles(video_id, info_webpage)
382ed50e PH	123
	124	return {
	125	'id': video_id,
	126	'title': title,
d948e09b	127	'formats': formats,
382ed50e PH	128	'description': description,
	129	'thumbnail': thumbnail,
	130	'age_limit': age_limit,
	131	'uploader': uploader,
	132	'subtitles': video_subtitles,
	133	'upload_date': upload_date,
	134	}
	135
4f7cea6c	136	def _get_subtitles(self, video_id, info_webpage):
382ed50e	137	res = {}
4f7cea6c	138	for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
de79c46c	139	sturl = unescapeHTML(sturl_html)
382ed50e PH	140	m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
	141	if not m:
	142	continue
4f7cea6c JMF	143	res[m.group('lang')] = [{
	144	'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
	145	'ext': 'vtt',
	146	}]
382ed50e	147	return res
0d7f0364	148
0d7f0364	149
8da0e0e9	150	class VikiChannelIE(InfoExtractor):
8da0e0e9	151	IE_NAME = 'viki:channel'
0d7f0364	152	_VALID_URL = r'^https?://(?:www\.)?viki\.com/tv/(?P<id>[0-9]+c)'
	153	_TESTS = [{
	154	'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
	155	'info_dict': {
	156	'id': '50c',
	157	'title': 'Boys Over Flowers',
	158	'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
	159	},
1c18de00	160	'playlist_count': 70,
	161	}, {
	162	'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
	163	'info_dict': {
	164	'id': '1354c',
	165	'title': 'Poor Nastya [COMPLETE]',
	166	'description': 'md5:05bf5471385aa8b21c18ad450e350525',
	167	},
	168	'playlist_count': 127,
0d7f0364	169	}]
8da0e0e9	170	_PER_PAGE = 25
0d7f0364	171
	172	def _real_extract(self, url):
	173	show_id = self._match_id(url)
	174	show_page = self._download_webpage(url, show_id, 'Download show page')
	175
	176	title = self._og_search_title(show_page)
	177	description = self._og_search_description(show_page)
	178
0d7f0364	179	entries = []
1c18de00	180	for video_type in ['episodes', 'clips']:
8da0e0e9	181	json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type, self._PER_PAGE)
1c18de00	182	while json_url is not None:
1c18de00	183	show_json = self._download_json(
8da0e0e9	184	json_url, show_id,
	185	note='Downloading %s json page #%s' %
	186	(video_type, re.search(r'[?&]page=([0-9]+)', json_url).group(1)))
1c18de00	187	for video in show_json['response']:
	188	video_id = video['id']
	189	entries.append(self.url_result(
	190	'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id))
	191	json_url = show_json['pagination']['next']
0d7f0364	192
0d7f0364	193	return self.playlist_result(entries, show_id, title, description)