[yt-dlp.git] / youtube_dl / extractor / tudou.py

# coding: utf-8

from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    int_or_none,
    float_or_none,
    unescapeHTML,
)


class TudouIE(InfoExtractor):
    """Extractor for single Tudou video pages.

    Item metadata is read from the ``getItemInfo.action`` JSON endpoint.
    Items that carry a ``vcode`` are handed over to the Youku extractor; all
    other items are returned as a ``multi_video`` result with one entry per
    segment of the highest numeric quality.
    """

    IE_NAME = 'tudou'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        'md5': '140a49ed444bd22f93330985d8475fcb',
        'info_dict': {
            'id': '159448201',
            'ext': 'f4v',
            'title': '卡马乔国足开大脚长传冲吊集锦',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            # Seconds since the epoch (the site reports milliseconds).
            'timestamp': 1372113489,
            'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',
            'duration': 289.04,
            'view_count': int,
            'filesize': int,
        }
    }, {
        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
        'info_dict': {
            'id': '117049447',
            'ext': 'f4v',
            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1349207518,
            'description': 'md5:294612423894260f2dcd5c6c04fe248b',
            'duration': 5478.33,
            'view_count': int,
            'filesize': int,
        }
    }]

    # Sent as the Referer header when the media URLs are downloaded.
    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'

    def _url_for_id(self, video_id, quality=None):
        """Return the media URL for one segment.

        The URL is the text content of the XML document served by
        ``v2.tudou.com/f`` for ``video_id``.  When ``quality`` is given it is
        appended to the request as the ``hd`` argument.
        """
        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
        if quality:
            info_url += '&hd' + quality
        xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
        return xml_data.text

    def _real_extract(self, url):
        """Build the info dict (or a Youku redirect) for a Tudou video URL."""
        video_id = self._match_id(url)
        item_data = self._download_json(
            'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)

        # Items with a 'vcode' are delegated to the Youku extractor.
        youku_vcode = item_data.get('vcode')
        if youku_vcode:
            return self.url_result('youku:' + youku_vcode, ie='Youku')

        title = unescapeHTML(item_data['kw'])
        description = item_data.get('desc')
        thumbnail_url = item_data.get('pic')
        view_count = int_or_none(item_data.get('playTimes'))
        # 'pt' is expressed in milliseconds, while the info dict 'timestamp'
        # field is a UNIX timestamp in seconds, hence the scaling.
        timestamp = int_or_none(item_data.get('pt'), scale=1000)

        segments = self._parse_json(item_data['itemSegs'], video_id)
        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, we pick the highest one.
        # Also, filter non-number qualities (see issue #3643).
        numeric_qualities = [k for k in segments.keys() if k.isdigit()]
        quality = sorted(numeric_qualities, key=int)[-1]
        parts = segments[quality]

        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen('%s: found %s parts' % (video_id, len_parts))

        result = []
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            # Extension is whatever follows the last dot before the query.
            ext = (final_url.split('?')[0]).split('.')[-1]
            result.append({
                'id': '%s' % part_id,
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
                'description': description,
                'view_count': view_count,
                'timestamp': timestamp,
                # 'seconds' is divided by 1000 to obtain the duration.
                'duration': float_or_none(part.get('seconds'), 1000),
                'filesize': int_or_none(part.get('size')),
                'http_headers': {
                    'Referer': self._PLAYER_URL,
                },
            })

        return {
            '_type': 'multi_video',
            'entries': result,
            'id': video_id,
            'title': title,
        }


class TudouPlaylistIE(InfoExtractor):
    """Extractor for Tudou ``listplay`` playlist pages."""

    IE_NAME = 'tudou:playlist'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
        'info_dict': {
            'id': 'zzdE77v6Mmo',
        },
        'playlist_mincount': 209,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one Tudou URL entry per listed item."""
        list_code = self._match_id(url)
        api_url = 'http://www.tudou.com/tvp/plist.action?lcode=%s' % list_code
        listing = self._download_json(api_url, list_code)

        videos = []
        for entry in listing['items']:
            # Each item is re-dispatched to the single-video extractor via
            # its regular programs/view page URL.
            icode = entry['icode']
            page_url = 'http://www.tudou.com/programs/view/%s' % icode
            videos.append(
                self.url_result(page_url, 'Tudou', icode, entry['kw']))

        return self.playlist_result(videos, list_code)


class TudouAlbumIE(InfoExtractor):
    """Extractor for Tudou ``albumcover`` / ``albumplay`` album pages."""

    IE_NAME = 'tudou:album'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
        'info_dict': {
            'id': 'v5qckFJvNJg',
        },
        'playlist_mincount': 45,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one Tudou URL entry per album item."""
        album_code = self._match_id(url)
        api_url = 'http://www.tudou.com/tvp/alist.action?acode=%s' % album_code
        listing = self._download_json(api_url, album_code)

        videos = []
        for entry in listing['items']:
            # Each item is re-dispatched to the single-video extractor via
            # its regular programs/view page URL.
            icode = entry['icode']
            page_url = 'http://www.tudou.com/programs/view/%s' % icode
            videos.append(
                self.url_result(page_url, 'Tudou', icode, entry['kw']))

        return self.playlist_result(videos, album_code)
Commit	Line	Data
24a267b5 JMF	1	# coding: utf-8
24a267b5 JMF	2
8bdfddf6 PH	3	from __future__ import unicode_literals
8bdfddf6 PH	4
9caa687d	5	from .common import InfoExtractor
b264c213	6	from ..compat import compat_str
40cf7fcb	7	from ..utils import (
	8	int_or_none,
	9	float_or_none,
	10	unescapeHTML,
	11	)
9caa687d YK	12
	13
	14	class TudouIE(InfoExtractor):
40cf7fcb	15	IE_NAME = 'tudou'
40cf7fcb	16	_VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs\|wlplay)/view\|(?:listplay\|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
9ed3bdc6	17	_TESTS = [{
8bdfddf6 PH	18	'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
	19	'md5': '140a49ed444bd22f93330985d8475fcb',
	20	'info_dict': {
	21	'id': '159448201',
	22	'ext': 'f4v',
	23	'title': '卡马乔国足开大脚长传冲吊集锦',
	24	'thumbnail': 're:^https?://.*\.jpg$',
40cf7fcb	25	'timestamp': 1372113489000,
	26	'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',
	27	'duration': 289.04,
	28	'view_count': int,
	29	'filesize': int,
6f5ac90c	30	}
a8be56ce PH	31	}, {
	32	'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
	33	'info_dict': {
	34	'id': '117049447',
	35	'ext': 'f4v',
	36	'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
	37	'thumbnail': 're:^https?://.*\.jpg$',
40cf7fcb	38	'timestamp': 1349207518000,
	39	'description': 'md5:294612423894260f2dcd5c6c04fe248b',
	40	'duration': 5478.33,
	41	'view_count': int,
	42	'filesize': int,
a8be56ce	43	}
9ed3bdc6	44	}]
9caa687d	45
c71a3195	46	_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
23875575	47
aab13551	48	def _url_for_id(self, video_id, quality=None):
b264c213	49	info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
24a267b5 JMF	50	if quality:
24a267b5 JMF	51	info_url += '&hd' + quality
611c1dd9	52	xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
87813a85	53	final_url = xml_data.text
24a267b5 JMF	54	return final_url
24a267b5 JMF	55
9caa687d	56	def _real_extract(self, url):
92b065dc	57	video_id = self._match_id(url)
40cf7fcb	58	item_data = self._download_json(
40cf7fcb	59	'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)
9ed3bdc6	60
40cf7fcb	61	youku_vcode = item_data.get('vcode')
141ba369 YCH	62	if youku_vcode:
141ba369 YCH	63	return self.url_result('youku:' + youku_vcode, ie='Youku')
9ed3bdc6	64
40cf7fcb	65	title = unescapeHTML(item_data['kw'])
	66	description = item_data.get('desc')
	67	thumbnail_url = item_data.get('pic')
	68	view_count = int_or_none(item_data.get('playTimes'))
	69	timestamp = int_or_none(item_data.get('pt'))
23875575	70
40cf7fcb	71	segments = self._parse_json(item_data['itemSegs'], video_id)
24a267b5 JMF	72	# It looks like the keys are the arguments that have to be passed as
24a267b5 JMF	73	# the hd field in the request url, we pick the higher
ca9cd290	74	# Also, filter non-number qualities (see issue #3643).
f931e259 NJ	75	quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
f931e259 NJ	76	key=lambda k: int(k))[-1]
24a267b5 JMF	77	parts = segments[quality]
	78	result = []
	79	len_parts = len(parts)
	80	if len_parts > 1:
8865bdeb	81	self.to_screen('%s: found %s parts' % (video_id, len_parts))
24a267b5 JMF	82	for part in parts:
	83	part_id = part['k']
	84	final_url = self._url_for_id(part_id, quality)
	85	ext = (final_url.split('?')[0]).split('.')[-1]
8bdfddf6 PH	86	part_info = {
	87	'id': '%s' % part_id,
	88	'url': final_url,
	89	'ext': ext,
	90	'title': title,
	91	'thumbnail': thumbnail_url,
40cf7fcb	92	'description': description,
	93	'view_count': view_count,
	94	'timestamp': timestamp,
	95	'duration': float_or_none(part.get('seconds'), 1000),
	96	'filesize': int_or_none(part.get('size')),
c71a3195	97	'http_headers': {
40cf7fcb	98	'Referer': self._PLAYER_URL,
c71a3195	99	},
8bdfddf6	100	}
24a267b5 JMF	101	result.append(part_info)
24a267b5 JMF	102
92b065dc PH	103	return {
	104	'_type': 'multi_video',
	105	'entries': result,
	106	'id': video_id,
	107	'title': title,
	108	}
40cf7fcb	109
	110
	111	class TudouPlaylistIE(InfoExtractor):
	112	IE_NAME = 'tudou:playlist'
	113	_VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
	114	_TESTS = [{
	115	'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
	116	'info_dict': {
	117	'id': 'zzdE77v6Mmo',
	118	},
	119	'playlist_mincount': 209,
	120	}]
	121
	122	def _real_extract(self, url):
	123	playlist_id = self._match_id(url)
	124	playlist_data = self._download_json(
	125	'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)
	126	entries = [self.url_result(
	127	'http://www.tudou.com/programs/view/%s' % item['icode'],
	128	'Tudou', item['icode'],
	129	item['kw']) for item in playlist_data['items']]
	130	return self.playlist_result(entries, playlist_id)
	131
	132
	133	class TudouAlbumIE(InfoExtractor):
	134	IE_NAME = 'tudou:album'
	135	_VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover\|play)/(?P<id>[\w-]{11})'
	136	_TESTS = [{
	137	'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
	138	'info_dict': {
	139	'id': 'v5qckFJvNJg',
	140	},
	141	'playlist_mincount': 45,
	142	}]
	143
	144	def _real_extract(self, url):
	145	album_id = self._match_id(url)
	146	album_data = self._download_json(
	147	'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)
	148	entries = [self.url_result(
	149	'http://www.tudou.com/programs/view/%s' % item['icode'],
	150	'Tudou', item['icode'],
	151	item['kw']) for item in album_data['items']]
	152	return self.playlist_result(entries, album_id)