[yt-dlp.git] / youtube_dl / extractor / tudou.py

# coding: utf-8

from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    int_or_none,
    InAdvancePagedList,
    float_or_none,
    unescapeHTML,
)


class TudouIE(InfoExtractor):
    """Extractor for single Tudou video pages.

    The item metadata is fetched from the ``getItemInfo.action`` JSON
    endpoint. Items that carry a Youku ``vcode`` are delegated to the Youku
    extractor; otherwise every segment of the highest numeric quality is
    resolved lazily and returned as a ``multi_video`` result.
    """
    IE_NAME = 'tudou'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        'md5': '140a49ed444bd22f93330985d8475fcb',
        'info_dict': {
            'id': '159448201',
            'ext': 'f4v',
            'title': '卡马乔国足开大脚长传冲吊集锦',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1372113489,
            'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',
            'duration': 289.04,
            'view_count': int,
            'filesize': int,
        }
    }, {
        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
        'info_dict': {
            'id': '117049447',
            'ext': 'f4v',
            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1349207518,
            'description': 'md5:294612423894260f2dcd5c6c04fe248b',
            'duration': 5478.33,
            'view_count': int,
            'filesize': int,
        }
    }]

    # Sent as the Referer header when downloading the media files.
    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'

    # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf
    # 0001, 0002 and 4001 are not included as they indicate temporary issues
    TVC_ERRORS = {
        '0003': 'The video is deleted or does not exist',
        '1001': 'This video is unavailable due to licensing issues',
        '1002': 'This video is unavailable as it\'s under review',
        '1003': 'This video is unavailable as it\'s under review',
        '3001': 'Password required',
        '5001': 'This video is available in Mainland China only due to licensing issues',
        '7001': 'This video is unavailable',
        '8001': 'This video is unavailable due to licensing issues',
    }

    def _url_for_id(self, video_id, quality=None):
        """Return the media URL of one segment.

        ``video_id`` is the segment id (the ``k`` field of a segment) and
        ``quality`` the optional quality key appended to the request.

        Raises ExtractorError when the XML root carries an ``error``
        attribute or when it contains no URL text.
        """
        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
        if quality:
            # NOTE(review): the quality is appended as '&hd<quality>' with no
            # '=' sign; presumably this is what the endpoint expects - confirm.
            info_url += '&hd' + quality
        xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
        error = xml_data.attrib.get('error')
        if error is not None:
            raise ExtractorError('Tudou said: %s' % error, expected=True)
        final_url = xml_data.text
        if not final_url:
            # An empty element would otherwise surface later as an
            # AttributeError when the caller splits the URL.
            raise ExtractorError('Tudou returned no video URL for %s' % video_id)
        return final_url

    def _real_extract(self, url):
        """Extract the video at ``url`` as a ``multi_video`` result.

        Raises ExtractorError when Tudou reports an error code instead of
        segment data, or when no numeric quality is available.
        """
        video_id = self._match_id(url)
        item_data = self._download_json(
            'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)

        # Items hosted on Youku are handed over to the Youku extractor.
        youku_vcode = item_data.get('vcode')
        if youku_vcode:
            return self.url_result('youku:' + youku_vcode, ie='Youku')

        if not item_data.get('itemSegs'):
            tvc_code = item_data.get('tvcCode')
            if tvc_code:
                err_msg = self.TVC_ERRORS.get(tvc_code)
                if err_msg:
                    raise ExtractorError('Tudou said: %s' % err_msg, expected=True)
                raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code)
            raise ExtractorError('Unexpected error returned from Tudou')

        title = unescapeHTML(item_data['kw'])
        description = item_data.get('desc')
        thumbnail_url = item_data.get('pic')
        view_count = int_or_none(item_data.get('playTimes'))
        # 'pt' is expressed in milliseconds while the info dict expects a
        # UNIX timestamp in seconds.
        timestamp = int_or_none(item_data.get('pt'), scale=1000)

        segments = self._parse_json(item_data['itemSegs'], video_id)
        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, we pick the higher
        # Also, filter non-number qualities (see issue #3643).
        numeric_qualities = sorted(
            (k for k in segments.keys() if k.isdigit()), key=int)
        if not numeric_qualities:
            raise ExtractorError('No supported quality found for %s' % video_id)
        quality = numeric_qualities[-1]
        parts = segments[quality]
        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen('%s: found %s parts' % (video_id, len_parts))

        def part_func(partnum):
            # Page function for InAdvancePagedList: one segment per page, so
            # the segment URL is only requested when that entry is needed.
            part = parts[partnum]
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            # The extension is whatever follows the last dot before the query.
            ext = (final_url.split('?')[0]).split('.')[-1]
            return [{
                'id': compat_str(part_id),
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
                'description': description,
                'view_count': view_count,
                'timestamp': timestamp,
                'duration': float_or_none(part.get('seconds'), 1000),
                'filesize': int_or_none(part.get('size')),
                'http_headers': {
                    'Referer': self._PLAYER_URL,
                },
            }]

        entries = InAdvancePagedList(part_func, len_parts, 1)

        return {
            '_type': 'multi_video',
            'entries': entries,
            'id': video_id,
            'title': title,
        }


class TudouPlaylistIE(InfoExtractor):
    """Extractor for Tudou ``listplay`` playlist pages."""
    IE_NAME = 'tudou:playlist'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
        'info_dict': {
            'id': 'zzdE77v6Mmo',
        },
        'playlist_mincount': 209,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one URL entry per listed item."""
        list_code = self._match_id(url)
        listing = self._download_json(
            'http://www.tudou.com/tvp/plist.action?lcode=%s' % list_code, list_code)

        results = []
        for video in listing['items']:
            # Each item is delegated to the Tudou single-video extractor.
            page_url = 'http://www.tudou.com/programs/view/%s' % video['icode']
            results.append(
                self.url_result(page_url, 'Tudou', video['icode'], video['kw']))
        return self.playlist_result(results, list_code)


class TudouAlbumIE(InfoExtractor):
    """Extractor for Tudou ``albumcover`` / ``albumplay`` album pages."""
    IE_NAME = 'tudou:album'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
        'info_dict': {
            'id': 'v5qckFJvNJg',
        },
        'playlist_mincount': 45,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one URL entry per album item."""
        album_code = self._match_id(url)
        info_url = 'http://www.tudou.com/tvp/alist.action?acode=%s' % album_code
        album_info = self._download_json(info_url, album_code)

        # Each item is delegated to the Tudou single-video extractor.
        results = list(
            self.url_result(
                'http://www.tudou.com/programs/view/%s' % video['icode'],
                'Tudou', video['icode'], video['kw'])
            for video in album_info['items'])
        return self.playlist_result(results, album_code)
Commit	Line	Data
24a267b5 JMF	1	# coding: utf-8
24a267b5 JMF	2
8bdfddf6 PH	3	from __future__ import unicode_literals
8bdfddf6 PH	4
9caa687d	5	from .common import InfoExtractor
b264c213	6	from ..compat import compat_str
40cf7fcb	7	from ..utils import (
5b012dfc	8	ExtractorError,
40cf7fcb	9	int_or_none,
664bcd80	10	InAdvancePagedList,
40cf7fcb	11	float_or_none,
	12	unescapeHTML,
	13	)
9caa687d YK	14
	15
	16	class TudouIE(InfoExtractor):
40cf7fcb	17	IE_NAME = 'tudou'
40cf7fcb	18	_VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs\|wlplay)/view\|(?:listplay\|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
9ed3bdc6	19	_TESTS = [{
8bdfddf6 PH	20	'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
	21	'md5': '140a49ed444bd22f93330985d8475fcb',
	22	'info_dict': {
	23	'id': '159448201',
	24	'ext': 'f4v',
	25	'title': '卡马乔国足开大脚长传冲吊集锦',
	26	'thumbnail': 're:^https?://.*\.jpg$',
40cf7fcb	27	'timestamp': 1372113489000,
	28	'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',
	29	'duration': 289.04,
	30	'view_count': int,
	31	'filesize': int,
6f5ac90c	32	}
a8be56ce PH	33	}, {
	34	'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
	35	'info_dict': {
	36	'id': '117049447',
	37	'ext': 'f4v',
	38	'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
	39	'thumbnail': 're:^https?://.*\.jpg$',
40cf7fcb	40	'timestamp': 1349207518000,
	41	'description': 'md5:294612423894260f2dcd5c6c04fe248b',
	42	'duration': 5478.33,
	43	'view_count': int,
	44	'filesize': int,
a8be56ce	45	}
9ed3bdc6	46	}]
9caa687d	47
c71a3195	48	_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
23875575	49
5b012dfc YCH	50	# Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf
	51	# 0001, 0002 and 4001 are not included as they indicate temporary issues
	52	TVC_ERRORS = {
	53	'0003': 'The video is deleted or does not exist',
	54	'1001': 'This video is unavailable due to licensing issues',
	55	'1002': 'This video is unavailable as it\'s under review',
	56	'1003': 'This video is unavailable as it\'s under review',
	57	'3001': 'Password required',
	58	'5001': 'This video is available in Mainland China only due to licensing issues',
	59	'7001': 'This video is unavailable',
	60	'8001': 'This video is unavailable due to licensing issues',
	61	}
	62
aab13551	63	def _url_for_id(self, video_id, quality=None):
b264c213	64	info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
24a267b5 JMF	65	if quality:
24a267b5 JMF	66	info_url += '&hd' + quality
611c1dd9	67	xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
bec47a07 YCH	68	error = xml_data.attrib.get('error')
	69	if error is not None:
	70	raise ExtractorError('Tudou said: %s' % error, expected=True)
87813a85	71	final_url = xml_data.text
24a267b5 JMF	72	return final_url
24a267b5 JMF	73
9caa687d	74	def _real_extract(self, url):
92b065dc	75	video_id = self._match_id(url)
40cf7fcb	76	item_data = self._download_json(
40cf7fcb	77	'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)
9ed3bdc6	78
40cf7fcb	79	youku_vcode = item_data.get('vcode')
141ba369 YCH	80	if youku_vcode:
141ba369 YCH	81	return self.url_result('youku:' + youku_vcode, ie='Youku')
9ed3bdc6	82
5b012dfc YCH	83	if not item_data.get('itemSegs'):
	84	tvc_code = item_data.get('tvcCode')
	85	if tvc_code:
	86	err_msg = self.TVC_ERRORS.get(tvc_code)
	87	if err_msg:
	88	raise ExtractorError('Tudou said: %s' % err_msg, expected=True)
	89	raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code)
	90	raise ExtractorError('Unxpected error returned from Tudou')
	91
40cf7fcb	92	title = unescapeHTML(item_data['kw'])
	93	description = item_data.get('desc')
	94	thumbnail_url = item_data.get('pic')
	95	view_count = int_or_none(item_data.get('playTimes'))
	96	timestamp = int_or_none(item_data.get('pt'))
23875575	97
40cf7fcb	98	segments = self._parse_json(item_data['itemSegs'], video_id)
24a267b5 JMF	99	# It looks like the keys are the arguments that have to be passed as
24a267b5 JMF	100	# the hd field in the request url, we pick the higher
ca9cd290	101	# Also, filter non-number qualities (see issue #3643).
f931e259 NJ	102	quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
f931e259 NJ	103	key=lambda k: int(k))[-1]
24a267b5	104	parts = segments[quality]
24a267b5 JMF	105	len_parts = len(parts)
24a267b5 JMF	106	if len_parts > 1:
8865bdeb	107	self.to_screen('%s: found %s parts' % (video_id, len_parts))
664bcd80 YCH	108
	109	def part_func(partnum):
	110	part = parts[partnum]
24a267b5 JMF	111	part_id = part['k']
	112	final_url = self._url_for_id(part_id, quality)
	113	ext = (final_url.split('?')[0]).split('.')[-1]
664bcd80	114	return [{
8bdfddf6 PH	115	'id': '%s' % part_id,
	116	'url': final_url,
	117	'ext': ext,
	118	'title': title,
	119	'thumbnail': thumbnail_url,
40cf7fcb	120	'description': description,
	121	'view_count': view_count,
	122	'timestamp': timestamp,
	123	'duration': float_or_none(part.get('seconds'), 1000),
	124	'filesize': int_or_none(part.get('size')),
c71a3195	125	'http_headers': {
40cf7fcb	126	'Referer': self._PLAYER_URL,
c71a3195	127	},
664bcd80 YCH	128	}]
	129
	130	entries = InAdvancePagedList(part_func, len_parts, 1)
24a267b5	131
92b065dc PH	132	return {
92b065dc PH	133	'_type': 'multi_video',
664bcd80	134	'entries': entries,
92b065dc PH	135	'id': video_id,
	136	'title': title,
	137	}
40cf7fcb	138
	139
	140	class TudouPlaylistIE(InfoExtractor):
	141	IE_NAME = 'tudou:playlist'
	142	_VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
	143	_TESTS = [{
	144	'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
	145	'info_dict': {
	146	'id': 'zzdE77v6Mmo',
	147	},
	148	'playlist_mincount': 209,
	149	}]
	150
	151	def _real_extract(self, url):
	152	playlist_id = self._match_id(url)
	153	playlist_data = self._download_json(
	154	'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)
	155	entries = [self.url_result(
	156	'http://www.tudou.com/programs/view/%s' % item['icode'],
	157	'Tudou', item['icode'],
	158	item['kw']) for item in playlist_data['items']]
	159	return self.playlist_result(entries, playlist_id)
	160
	161
	162	class TudouAlbumIE(InfoExtractor):
	163	IE_NAME = 'tudou:album'
	164	_VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover\|play)/(?P<id>[\w-]{11})'
	165	_TESTS = [{
	166	'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
	167	'info_dict': {
	168	'id': 'v5qckFJvNJg',
	169	},
	170	'playlist_mincount': 45,
	171	}]
	172
	173	def _real_extract(self, url):
	174	album_id = self._match_id(url)
	175	album_data = self._download_json(
	176	'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)
	177	entries = [self.url_result(
	178	'http://www.tudou.com/programs/view/%s' % item['icode'],
	179	'Tudou', item['icode'],
	180	item['kw']) for item in album_data['items']]
	181	return self.playlist_result(entries, album_id)