[yt-dlp.git] / youtube_dl / extractor / vlive.py

# coding: utf-8
from __future__ import unicode_literals

import re
import time
import itertools

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode,
    compat_str,
)
from ..utils import (
    dict_get,
    ExtractorError,
    float_or_none,
    int_or_none,
    remove_start,
    try_get,
    urlencode_postdata,
)


class VLiveIE(InfoExtractor):
    """Extractor for single vlive.tv video pages.

    The video page embeds a ``vlive.video.init(...)`` call. Its arguments
    carry the broadcast status and, for replays, the long video id and key
    that are passed on to the play-info API.
    """
    IE_NAME = 'vlive'
    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.vlive.tv/video/1326',
        'md5': 'cc7314812855ce56de70a06a27314983',
        'info_dict': {
            'id': '1326',
            'ext': 'mp4',
            'title': "[V LIVE] Girl's Day's Broadcast",
            'creator': "Girl's Day",
            'view_count': int,
        },
    }, {
        'url': 'http://www.vlive.tv/video/16937',
        'info_dict': {
            'id': '16937',
            'ext': 'mp4',
            'title': '[V LIVE] 첸백시 걍방',
            'creator': 'EXO',
            'view_count': int,
            'subtitles': 'mincount:12',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Dispatch to live or replay extraction based on the page status.

        Returns the info dict built by ``_live`` or ``_replay``.

        Raises ExtractorError when the init parameters cannot be found or
        are incomplete, and for every status that has nothing to play
        (upload in progress, coming soon, canceled, unknown).
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            'http://www.vlive.tv/video/%s' % video_id, video_id)

        VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
        VIDEO_PARAMS_FIELD = 'video params'

        # First attempt: wrap the raw argument list in brackets and parse it
        # as a JSON array. Both steps are non-fatal so that the plain-text
        # fallback below still gets a chance.
        params = self._parse_json(self._search_regex(
            VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
            transform_source=lambda s: '[' + s + ']', fatal=False)

        if not params or len(params) < 7:
            # Fallback: split the argument list on commas and drop the
            # surrounding double quotes. The search is fatal this time.
            params = self._search_regex(
                VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
            params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]

        # Indexes 2, 5 and 6 are read below; fail with a proper extractor
        # error instead of a bare IndexError when the list is too short.
        if len(params) < 7:
            raise ExtractorError(
                'Unable to extract %s' % VIDEO_PARAMS_FIELD)

        status, long_video_id, key = params[2], params[5], params[6]
        status = remove_start(status, 'PRODUCT_')

        if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
            return self._live(video_id, webpage)
        elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
            if long_video_id and key:
                return self._replay(video_id, webpage, long_video_id, key)
            else:
                # Without both identifiers the replay cannot be requested,
                # so report it the same way as a not-yet-available video.
                status = 'COMING_SOON'

        if status == 'LIVE_END':
            raise ExtractorError('Uploading for replay. Please wait...',
                                 expected=True)
        elif status == 'COMING_SOON':
            raise ExtractorError('Coming soon!', expected=True)
        elif status == 'CANCELED':
            raise ExtractorError('We are sorry, '
                                 'but the live broadcast has been canceled.',
                                 expected=True)
        else:
            raise ExtractorError('Unknown status %s' % status)

    def _get_common_fields(self, webpage):
        """Return title, creator and thumbnail scraped from the video page.

        The creator lookup is non-fatal, so 'creator' may be None.
        """
        title = self._og_search_title(webpage)
        creator = self._html_search_regex(
            r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)',
            webpage, 'creator', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage)
        return {
            'title': title,
            'creator': creator,
            'thumbnail': thumbnail,
        }

    def _live(self, video_id, webpage):
        """Build the info dict for a broadcast that is currently on air.

        Requests the init view page with ``videoSeq`` as form data, reads
        its ``liveStreamInfo`` value and collects the HLS formats of every
        listed resolution that has a ``cdnUrl``.
        """
        init_page = self._download_webpage(
            'http://www.vlive.tv/video/init/view',
            video_id, note='Downloading live webpage',
            data=urlencode_postdata({'videoSeq': video_id}),
            headers={
                'Referer': 'http://www.vlive.tv/video/%s' % video_id,
                'Content-Type': 'application/x-www-form-urlencoded'
            })

        live_params = self._search_regex(
            r'"liveStreamInfo"\s*:\s*(".*"),',
            init_page, 'live stream info')
        # The captured value is a quoted JSON string whose content is itself
        # JSON: the first pass decodes the string, the second the object.
        live_params = self._parse_json(live_params, video_id)
        live_params = self._parse_json(live_params, video_id)

        formats = []
        # "or []" also covers an explicit null, not only a missing key.
        for vid in live_params.get('resolutions') or []:
            cdn_url = vid.get('cdnUrl')
            if not cdn_url:
                # Nothing to download for this entry; keep the other ones.
                continue
            formats.extend(self._extract_m3u8_formats(
                cdn_url, video_id, 'mp4',
                m3u8_id=vid.get('name'),
                fatal=False, live=True))
        self._sort_formats(formats)

        info = self._get_common_fields(webpage)
        info.update({
            'title': self._live_title(info['title']),
            'id': video_id,
            'formats': formats,
            'is_live': True,
        })
        return info

    def _replay(self, video_id, webpage, long_video_id, key):
        """Build the info dict for a recorded video.

        Queries the play-info API with ``long_video_id`` and ``key`` and
        turns its video list, view count and caption list into formats,
        'view_count' and 'subtitles'.
        """
        playinfo = self._download_json(
            'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
            % compat_urllib_parse_urlencode({
                'videoId': long_video_id,
                'key': key,
                'ptc': 'http',
                'doct': 'json',  # document type (xml or json)
                'cpt': 'vtt',  # captions type (vtt or ttml)
            }), video_id)

        # Every optional container is read with "or {}" / "or []" so that an
        # explicit null in the response is treated like a missing key.
        formats = []
        for vid in (playinfo.get('videos') or {}).get('list') or []:
            source = vid.get('source')
            if not source:
                continue
            encoding = vid.get('encodingOption') or {}
            bitrate = vid.get('bitrate') or {}
            formats.append({
                'url': source,
                'format_id': encoding.get('name'),
                'abr': float_or_none(bitrate.get('audio')),
                'vbr': float_or_none(bitrate.get('video')),
                'width': int_or_none(encoding.get('width')),
                'height': int_or_none(encoding.get('height')),
                'filesize': int_or_none(vid.get('size')),
            })
        self._sort_formats(formats)

        view_count = int_or_none((playinfo.get('meta') or {}).get('count'))

        subtitles = {}
        for caption in (playinfo.get('captions') or {}).get('list') or []:
            # Use the first of these keys that holds a value as the language.
            lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
            if lang and caption.get('source'):
                subtitles[lang] = [{
                    'ext': 'vtt',
                    'url': caption['source']}]

        info = self._get_common_fields(webpage)
        info.update({
            'id': video_id,
            'formats': formats,
            'view_count': view_count,
            'subtitles': subtitles,
        })
        return info


class VLiveChannelIE(InfoExtractor):
    """Extractor that turns a channels.vlive.tv channel into a playlist.

    Every video listed by the channel API becomes a URL entry that is
    delegated to VLiveIE.
    """
    IE_NAME = 'vlive:channel'
    _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
    _TEST = {
        'url': 'http://channels.vlive.tv/FCD4B',
        'info_dict': {
            'id': 'FCD4B',
            'title': 'MAMAMOO',
        },
        'playlist_mincount': 110
    }
    # Used whenever no app id can be read from the channel page's app.js.
    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'

    def _resolve_app_id(self, webpage, channel_code):
        """Return the app id found in the page's app.js, else ``_APP_ID``."""
        script_url = self._search_regex(
            r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
            webpage, 'app js', default=None, group='url')
        if not script_url:
            return self._APP_ID

        # Non-fatal download: a failure just means the fallback id is used.
        script = self._download_webpage(
            script_url, channel_code, 'Downloading app JS', fatal=False)
        if not script:
            return self._APP_ID

        found = self._search_regex(
            r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
            script, 'app id', default=None)
        return found or self._APP_ID

    def _real_extract(self, url):
        """Return a playlist result with one URL entry per channel video."""
        channel_code = self._match_id(url)

        webpage = self._download_webpage(
            'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
        app_id = self._resolve_app_id(webpage, channel_code)

        decoded = self._download_json(
            'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
            channel_code, note='Downloading decode channel code',
            query={
                'app_id': app_id,
                'channelCode': channel_code,
                '_': int(time.time())
            })
        channel_seq = decoded['result']['channelSeq']

        channel_name = None
        entries = []

        # Walk the listing page by page until a page carries no videos.
        for page_num in itertools.count(1):
            listing = self._download_json(
                'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
                channel_code, note='Downloading channel list page #%d' % page_num,
                query={
                    'app_id': app_id,
                    'channelSeq': channel_seq,
                    'maxNumOfRows': 1000,
                    '_': int(time.time()),
                    'pageNo': page_num
                }
            )

            # Keep looking for the channel name until one page provides it.
            if not channel_name:
                channel_name = try_get(
                    listing,
                    lambda x: x['result']['channelInfo']['channelName'],
                    compat_str)

            page_videos = try_get(
                listing, lambda x: x['result']['videoList'], list)
            if not page_videos:
                break

            # Items without a video sequence number are skipped.
            page_ids = [
                compat_str(item['videoSeq'])
                for item in page_videos if item.get('videoSeq')]
            entries.extend(
                self.url_result(
                    'http://www.vlive.tv/video/%s' % seq,
                    ie=VLiveIE.ie_key(), video_id=seq)
                for seq in page_ids)

        return self.playlist_result(
            entries, channel_code, channel_name)
Commit	Line	Data
061f62da	1	# coding: utf-8
25bcd355	2	from __future__ import unicode_literals
061f62da	3
b24d6336	4	import re
b92d3c53	5	import time
b92d3c53	6	import itertools
9d186afa	7
061f62da	8	from .common import InfoExtractor
661cc229 S	9	from ..compat import (
	10	compat_urllib_parse_urlencode,
	11	compat_str,
	12	)
061f62da	13	from ..utils import (
52f5889f	14	dict_get,
9d186afa	15	ExtractorError,
52f5889f S	16	float_or_none,
52f5889f S	17	int_or_none,
345dec93	18	remove_start,
661cc229	19	try_get,
89c63cc5	20	urlencode_postdata,
061f62da	21	)
061f62da	22
	23
	24	class VLiveIE(InfoExtractor):
	25	IE_NAME = 'vlive'
52f5889f	26	_VALID_URL = r'https?://(?:(?:www\|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
58355a3b	27	_TESTS = [{
b8b465af	28	'url': 'http://www.vlive.tv/video/1326',
061f62da	29	'md5': 'cc7314812855ce56de70a06a27314983',
	30	'info_dict': {
	31	'id': '1326',
	32	'ext': 'mp4',
25bcd355	33	'title': "[V LIVE] Girl's Day's Broadcast",
52f5889f S	34	'creator': "Girl's Day",
52f5889f S	35	'view_count': int,
061f62da	36	},
58355a3b S	37	}, {
	38	'url': 'http://www.vlive.tv/video/16937',
	39	'info_dict': {
	40	'id': '16937',
	41	'ext': 'mp4',
	42	'title': '[V LIVE] 첸백시 걍방',
	43	'creator': 'EXO',
	44	'view_count': int,
	45	'subtitles': 'mincount:12',
	46	},
	47	'params': {
	48	'skip_download': True,
	49	},
	50	}]
061f62da	51
	52	def _real_extract(self, url):
	53	video_id = self._match_id(url)
	54
	55	webpage = self._download_webpage(
52f5889f	56	'http://www.vlive.tv/video/%s' % video_id, video_id)
061f62da	57
89c63cc5 S	58	VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
89c63cc5 S	59	VIDEO_PARAMS_FIELD = 'video params'
57774807	60
89c63cc5 S	61	params = self._parse_json(self._search_regex(
	62	VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
	63	transform_source=lambda s: '[' + s + ']', fatal=False)
	64
	65	if not params or len(params) < 7:
	66	params = self._search_regex(
	67	VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
	68	params = [p.strip(r'"') for p in re.split(r'\s,\s', params)]
	69
	70	status, long_video_id, key = params[2], params[5], params[6]
345dec93	71	status = remove_start(status, 'PRODUCT_')
b24d6336	72
40fcba5e	73	if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
57774807	74	return self._live(video_id, webpage)
40fcba5e	75	elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
b24d6336 KH	76	if long_video_id and key:
b24d6336 KH	77	return self._replay(video_id, webpage, long_video_id, key)
b24d6336 KH	78	else:
	79	status = 'COMING_SOON'
	80
	81	if status == 'LIVE_END':
	82	raise ExtractorError('Uploading for replay. Please wait...',
	83	expected=True)
	84	elif status == 'COMING_SOON':
25bcd355	85	raise ExtractorError('Coming soon!', expected=True)
b24d6336 KH	86	elif status == 'CANCELED':
	87	raise ExtractorError('We are sorry, '
	88	'but the live broadcast has been canceled.',
	89	expected=True)
	90	else:
	91	raise ExtractorError('Unknown status %s' % status)
	92
	93	def _get_common_fields(self, webpage):
061f62da	94	title = self._og_search_title(webpage)
b24d6336 KH	95	creator = self._html_search_regex(
	96	r'<div[^>]+class="info_area"[^>]>\s<a\s+[^>]*>([^<]+)',
	97	webpage, 'creator', fatal=False)
	98	thumbnail = self._og_search_thumbnail(webpage)
	99	return {
	100	'title': title,
	101	'creator': creator,
	102	'thumbnail': thumbnail,
	103	}
08354db4	104
57774807 CN	105	def _live(self, video_id, webpage):
	106	init_page = self._download_webpage(
	107	'http://www.vlive.tv/video/init/view',
89c63cc5 S	108	video_id, note='Downloading live webpage',
	109	data=urlencode_postdata({'videoSeq': video_id}),
	110	headers={
57774807 CN	111	'Referer': 'http://www.vlive.tv/video/%s' % video_id,
	112	'Content-Type': 'application/x-www-form-urlencoded'
	113	})
	114
	115	live_params = self._search_regex(
	116	r'"liveStreamInfo"\s:\s(".*"),',
89c63cc5	117	init_page, 'live stream info')
57774807 CN	118	live_params = self._parse_json(live_params, video_id)
	119	live_params = self._parse_json(live_params, video_id)
	120
b24d6336 KH	121	formats = []
	122	for vid in live_params.get('resolutions', []):
	123	formats.extend(self._extract_m3u8_formats(
	124	vid['cdnUrl'], video_id, 'mp4',
	125	m3u8_id=vid.get('name'),
	126	fatal=False, live=True))
	127	self._sort_formats(formats)
	128
069f9183 S	129	info = self._get_common_fields(webpage)
	130	info.update({
	131	'title': self._live_title(info['title']),
	132	'id': video_id,
	133	'formats': formats,
	134	'is_live': True,
	135	})
	136	return info
b24d6336 KH	137
b24d6336 KH	138	def _replay(self, video_id, webpage, long_video_id, key):
52f5889f S	139	playinfo = self._download_json(
52f5889f S	140	'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
15707c7e	141	% compat_urllib_parse_urlencode({
52f5889f S	142	'videoId': long_video_id,
	143	'key': key,
	144	'ptc': 'http',
	145	'doct': 'json', # document type (xml or json)
	146	'cpt': 'vtt', # captions type (vtt or ttml)
	147	}), video_id)
061f62da	148
52f5889f S	149	formats = [{
	150	'url': vid['source'],
	151	'format_id': vid.get('encodingOption', {}).get('name'),
	152	'abr': float_or_none(vid.get('bitrate', {}).get('audio')),
	153	'vbr': float_or_none(vid.get('bitrate', {}).get('video')),
	154	'width': int_or_none(vid.get('encodingOption', {}).get('width')),
	155	'height': int_or_none(vid.get('encodingOption', {}).get('height')),
	156	'filesize': int_or_none(vid.get('size')),
	157	} for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
061f62da	158	self._sort_formats(formats)
061f62da	159
52f5889f S	160	view_count = int_or_none(playinfo.get('meta', {}).get('count'))
52f5889f S	161
061f62da	162	subtitles = {}
b8b465af	163	for caption in playinfo.get('captions', {}).get('list', []):
49b69ad9	164	lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
52f5889f S	165	if lang and caption.get('source'):
	166	subtitles[lang] = [{
	167	'ext': 'vtt',
	168	'url': caption['source']}]
061f62da	169
069f9183 S	170	info = self._get_common_fields(webpage)
	171	info.update({
	172	'id': video_id,
	173	'formats': formats,
	174	'view_count': view_count,
	175	'subtitles': subtitles,
	176	})
	177	return info
b92d3c53	178
	179
	180	class VLiveChannelIE(InfoExtractor):
	181	IE_NAME = 'vlive:channel'
661cc229	182	_VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
b92d3c53	183	_TEST = {
661cc229	184	'url': 'http://channels.vlive.tv/FCD4B',
b92d3c53	185	'info_dict': {
	186	'id': 'FCD4B',
	187	'title': 'MAMAMOO',
	188	},
	189	'playlist_mincount': 110
	190	}
	191	_APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
	192
	193	def _real_extract(self, url):
	194	channel_code = self._match_id(url)
	195
	196	webpage = self._download_webpage(
	197	'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
661cc229 S	198
	199	app_id = None
	200
b92d3c53	201	app_js_url = self._search_regex(
661cc229 S	202	r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
661cc229 S	203	webpage, 'app js', default=None, group='url')
b92d3c53	204
b92d3c53	205	if app_js_url:
661cc229 S	206	app_js = self._download_webpage(
	207	app_js_url, channel_code, 'Downloading app JS', fatal=False)
	208	if app_js:
	209	app_id = self._search_regex(
	210	r'Global\.VFAN_APP_ID\s=\s[\'"]([^\'"]+)[\'"]',
	211	app_js, 'app id', default=None)
	212
	213	app_id = app_id or self._APP_ID
b92d3c53	214
	215	channel_info = self._download_json(
	216	'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
661cc229 S	217	channel_code, note='Downloading decode channel code',
	218	query={
	219	'app_id': app_id,
	220	'channelCode': channel_code,
	221	'_': int(time.time())
	222	})
b92d3c53	223
	224	channel_seq = channel_info['result']['channelSeq']
	225	channel_name = None
	226	entries = []
	227
	228	for page_num in itertools.count(1):
	229	video_list = self._download_json(
	230	'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
661cc229	231	channel_code, note='Downloading channel list page #%d' % page_num,
b92d3c53	232	query={
	233	'app_id': app_id,
	234	'channelSeq': channel_seq,
	235	'maxNumOfRows': 1000,
	236	'_': int(time.time()),
	237	'pageNo': page_num
	238	}
	239	)
b92d3c53	240
661cc229 S	241	if not channel_name:
	242	channel_name = try_get(
	243	video_list,
	244	lambda x: x['result']['channelInfo']['channelName'],
	245	compat_str)
	246
	247	videos = try_get(
	248	video_list, lambda x: x['result']['videoList'], list)
	249	if not videos:
b92d3c53	250	break
b92d3c53	251
661cc229 S	252	for video in videos:
	253	video_id = video.get('videoSeq')
	254	if not video_id:
	255	continue
	256	video_id = compat_str(video_id)
b92d3c53	257	entries.append(
b92d3c53	258	self.url_result(
661cc229 S	259	'http://www.vlive.tv/video/%s' % video_id,
661cc229 S	260	ie=VLiveIE.ie_key(), video_id=video_id))
b92d3c53	261
	262	return self.playlist_result(
	263	entries, channel_code, channel_name)