[yt-dlp.git] / youtube_dl / extractor / infoq.py

# coding: utf-8

from __future__ import unicode_literals

import base64

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_unquote,
    compat_parse_qs,
)
from ..utils import determine_ext


class InfoQIE(InfoExtractor):
    """Extractor for presentation pages on infoq.com.

    URLs containing ``/cn/`` are resolved through the bokecc player embedded
    in the page; every other page yields an RTMP format (when the page
    exposes one) plus an HTTP format protected by CloudFront cookies.
    """

    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
        'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
        'info_dict': {
            'id': 'A-Few-of-My-Favorite-Python-Things',
            'ext': 'mp4',
            'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
            'title': 'A Few of My Favorite [Python] Things',
        },
    }, {
        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
        'only_matching': True,
    }, {
        'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery',
        'md5': '4918d0cca1497f2244572caf626687ef',
        'info_dict': {
            'id': 'openstack-continued-delivery',
            'title': 'OpenStack持续交付之路',
            'ext': 'flv',
            'description': 'md5:308d981fb28fa42f49f9568322c683ff',
        },
    }]

    def _extract_bokecc_videos(self, webpage, video_id):
        """Return the formats offered by the bokecc player embedded in webpage.

        The player's query string supplies the ``siteid`` and ``vid`` values
        used to request the bokecc ``playinfo`` XML; one format is built per
        ``<quality>`` element that carries a play URL.

        Returns an empty list when the page has no bokecc player script or
        the script URL lacks the required parameters.
        """
        # TODO: bokecc.com is a Chinese video cloud platform
        # It should have an independent extractor but I don't have other
        # examples using bokecc
        player_params_str = self._html_search_regex(
            r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
            webpage, 'player params', default=None)
        if not player_params_str:
            # The search is non-fatal (default=None); do not hand None to
            # the query-string parser.
            return []

        player_params = compat_parse_qs(player_params_str)
        site_id = player_params.get('siteid', [None])[0]
        bokecc_vid = player_params.get('vid', [None])[0]
        if not site_id or not bokecc_vid:
            return []

        info_xml = self._download_xml(
            'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
                site_id, bokecc_vid), video_id)

        formats = []
        for quality in info_xml.findall('./video/quality'):
            copy_node = quality.find('./copy')
            play_url = copy_node.attrib.get('playurl') if copy_node is not None else None
            if not play_url:
                # Skip quality entries that do not carry a usable URL.
                continue
            formats.append({
                'format_id': 'bokecc',
                'url': play_url,
                'preference': int(quality.attrib['value']),
            })
        return formats

    def _extract_rtmp_videos(self, webpage):
        """Return the RTMP format described by the page's ``jsclassref`` value.

        ``jsclassref`` holds the base64-encoded, URL-quoted stream name; it is
        decoded and used as the RTMP play path on a hardcoded server.

        Returns a one-element list, or an empty list when the page does not
        define a non-empty ``jsclassref``.
        """
        # The server URL is hardcoded
        video_url = 'rtmpe://video.infoq.com/cfx/st/'

        # Extract video URL
        encoded_id = self._search_regex(
            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None)
        if not encoded_id:
            # Non-fatal lookup: without an id there is no RTMP stream to
            # offer, but other formats may still be extractable.
            return []

        real_id = compat_urllib_parse_unquote(
            base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
        playpath = 'mp4:' + real_id

        return [{
            'format_id': 'rtmp',
            'url': video_url,
            'ext': determine_ext(playpath),
            'play_path': playpath,
        }]

    def _extract_http_videos(self, webpage):
        """Return the HTTP format together with its CloudFront cookies.

        The video URL comes from the page's ``P.s`` assignment; the policy,
        signature and key-pair id come from the ``InfoQConstants.scp``,
        ``.scs`` and ``.sck`` assignments and are sent as a ``Cookie`` header.
        All four lookups use the helper's default error handling (no
        ``default`` is supplied).
        """
        http_video_url = self._search_regex(
            r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL')

        # The dots are escaped so that only the literal attribute access
        # "InfoQConstants.<name>" matches.
        policy = self._search_regex(
            r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'', webpage, 'policy')
        signature = self._search_regex(
            r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'', webpage, 'signature')
        key_pair_id = self._search_regex(
            r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id')

        return [{
            'format_id': 'http',
            'url': http_video_url,
            'http_headers': {
                'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % (
                    policy, signature, key_pair_id),
            },
        }]

    def _real_extract(self, url):
        """Download the page at url and build its info dict.

        Returns a dict with the ``id`` taken from the URL, the ``title`` from
        the page's ``<title>`` element, the ``description`` from the
        description meta tag, and the sorted ``formats`` list.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
        video_description = self._html_search_meta('description', webpage, 'description')

        if '/cn/' in url:
            # for China videos, HTTP video URL exists but always fails with 403
            formats = self._extract_bokecc_videos(webpage, video_id)
        else:
            formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)

        # NOTE(review): an empty list is presumably reported as an error by
        # _sort_formats - confirm against InfoExtractor.
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            'description': video_description,
            'formats': formats,
        }
Commit	Line	Data
02a63fad YCH	1	# coding: utf-8
02a63fad YCH	2
d882161d JMF	3	from __future__ import unicode_literals
d882161d JMF	4
fda7d31a	5	import base64
fda7d31a PH	6
fda7d31a PH	7	from .common import InfoExtractor
02a63fad YCH	8	from ..compat import (
	9	compat_urllib_parse_unquote,
	10	compat_parse_qs,
	11	)
	12	from ..utils import determine_ext
fda7d31a PH	13
	14
	15	class InfoQIE(InfoExtractor):
533f67d3	16	_VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
282cb9c7	17
533f67d3	18	_TESTS = [{
c0a7c608	19	'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
edec83a0	20	'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
c0a7c608	21	'info_dict': {
02a63fad	22	'id': 'A-Few-of-My-Favorite-Python-Things',
c0a7c608 PH	23	'ext': 'mp4',
	24	'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
	25	'title': 'A Few of My Favorite [Python] Things',
9d069c47	26	},
533f67d3 S	27	}, {
	28	'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
	29	'only_matching': True,
02a63fad YCH	30	}, {
	31	'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery',
	32	'md5': '4918d0cca1497f2244572caf626687ef',
	33	'info_dict': {
	34	'id': 'openstack-continued-delivery',
	35	'title': 'OpenStack持续交付之路',
	36	'ext': 'flv',
	37	'description': 'md5:308d981fb28fa42f49f9568322c683ff',
	38	},
533f67d3	39	}]
fda7d31a	40
02a63fad YCH	41	def _extract_bokecc_videos(self, webpage, video_id):
	42	# TODO: bokecc.com is a Chinese video cloud platform
	43	# It should have an independent extractor but I don't have other
	44	# examples using bokecc
	45	player_params_str = self._html_search_regex(
	46	r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
	47	webpage, 'player params', default=None)
fda7d31a	48
02a63fad YCH	49	player_params = compat_parse_qs(player_params_str)
	50
	51	info_xml = self._download_xml(
	52	'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
	53	player_params['siteid'][0], player_params['vid'][0]), video_id)
	54
	55	return [{
	56	'format_id': 'bokecc',
	57	'url': quality.find('./copy').attrib['playurl'],
	58	'preference': int(quality.attrib['value']),
	59	} for quality in info_xml.findall('./video/quality')]
fda7d31a	60
02a63fad	61	def _extract_rtmp_videos(self, webpage):
7560096d	62	# The server URL is hardcoded
282cb9c7	63	video_url = 'rtmpe://video.infoq.com/cfx/st/'
7560096d KW	64
7560096d KW	65	# Extract video URL
edec83a0	66	encoded_id = self._search_regex(
02a63fad YCH	67	r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None)
02a63fad YCH	68
8ee4ecb4	69	real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
7560096d	70	playpath = 'mp4:' + real_id
fda7d31a	71
02a63fad YCH	72	return [{
	73	'format_id': 'rtmp',
	74	'url': video_url,
	75	'ext': determine_ext(playpath),
	76	'play_path': playpath,
	77	}]
fda7d31a	78
02a63fad	79	def _extract_http_videos(self, webpage):
22d07ba4 YCH	80	http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL')
	81
	82	policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy')
	83	signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature')
	84	key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id')
edec83a0	85
02a63fad	86	return [{
edec83a0	87	'format_id': 'http',
22d07ba4 YCH	88	'url': http_video_url,
	89	'http_headers': {
	90	'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % (
	91	policy, signature, key_pair_id),
	92	},
edec83a0	93	}]
02a63fad YCH	94
	95	def _real_extract(self, url):
	96	video_id = self._match_id(url)
	97	webpage = self._download_webpage(url, video_id)
	98
	99	video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
	100	video_description = self._html_search_meta('description', webpage, 'description')
	101
	102	if '/cn/' in url:
	103	# for China videos, HTTP video URL exists but always fails with 403
	104	formats = self._extract_bokecc_videos(webpage, video_id)
	105	else:
	106	formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)
	107
edec83a0 PH	108	self._sort_formats(formats)
edec83a0 PH	109
c0a7c608	110	return {
fda7d31a	111	'id': video_id,
fda7d31a	112	'title': video_title,
fda7d31a	113	'description': video_description,
edec83a0	114	'formats': formats,
c0a7c608	115	}