[yt-dlp.git] / youtube_dl / extractor / brightcove.py

# encoding: utf-8
from __future__ import unicode_literals

import re
import json
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    compat_urllib_parse,
    find_xpath_attr,
    fix_xml_ampersands,
    compat_urlparse,
    compat_str,
    compat_urllib_request,
    compat_parse_qs,

    ExtractorError,
    unsmuggle_url,
)


class BrightcoveIE(InfoExtractor):
    """Information extractor for videos and playlists served by Brightcove.

    A URL that carries a video id is resolved to a single video, otherwise
    the player key in the URL is used to fetch a playlist (see
    _real_extract).
    """

    # Matches http(s) URLs that contain "brightcove.com/" followed by
    # "services" or "viewer"; everything after the "?" is captured in the
    # named group "query", which _real_extract parses.
    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
    # Format string for the federated viewer page; the single %s placeholder
    # is filled with an url-encoded query string.
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'

    # Sample URLs together with the metadata expected for each of them.
    # NOTE(review): presumably consumed by the project's download test
    # suite - confirm against the test runner.
    _TESTS = [
        {
            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
            'file': '2371591881001.mp4',
            'md5': '5423e113865d26e40624dce2e4b45d95',
            'note': 'Test Brightcove downloads and detection in GenericIE',
            'info_dict': {
                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                'uploader': '8TV',
                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
            }
        },
        {
            # From http://medianetwork.oracle.com/video/player/1785452137001
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
            'file': '1785452137001.flv',
            'info_dict': {
                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                'uploader': 'Oracle',
            },
        },
        {
            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
            # The video id is passed as "videoID" instead of "@videoPlayer"
            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
            'info_dict': {
                'id': '2750934548001',
                'ext': 'mp4',
                'title': 'This Bracelet Acts as a Personal Thermostat',
                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
                'uploader': 'Mashable',
            },
        },
        {
            # test that the default referer works
            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
            # The video id and player key are passed as "bctid" and "bckey"
            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
            'info_dict': {
                'id': '2878862109001',
                'ext': 'mp4',
                'title': 'Lost in Motion II',
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
        }
    ]

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Convert the markup of a Brightcove player embed into a federated URL.

        object_str is the text of an element of the form
        <object class="BrightcoveExperience">{params}</object>.
        Each setting is looked up first in the query-string encoded
        "flashVars" param and then among the plain <param> children.

        Returns _FEDERATED_URL_TEMPLATE filled with the url-encoded
        settings that were found.
        Raises ExtractorError if the embed defines no "playerID".
        """

        # Self-close unterminated <param ...> tags so that the markup parses
        # as XML, see https://github.com/rg3/youtube-dl/issues/1553
        object_str = re.sub(
            r'(<param name="[^"]+" value="[^"]+")>', r'\g<1>/>', object_str)
        # Repair malformed comment openers, see
        # https://github.com/rg3/youtube-dl/issues/1608
        object_str = fix_xml_ampersands(object_str.replace('<--', '<!--'))

        object_doc = xml.etree.ElementTree.fromstring(object_str)

        # Only the first value of every flashVars key is kept
        flashvars = {}
        flashvars_node = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
        if flashvars_node is not None:
            parsed = compat_parse_qs(flashvars_node.attrib['value'])
            for key, values in parsed.items():
                flashvars[key] = values[0]

        def find_param(name):
            # flashVars entries take precedence over <param> elements
            if name in flashvars:
                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            return None if node is None else node.attrib['value']

        player_id = find_param('playerID')
        if player_id is None:
            raise ExtractorError('Cannot find player ID')
        params = {'playerID': player_id}

        optional_params = (
            # Not all pages define this value
            ('playerKey', find_param('playerKey')),
            # Any of these three fields may hold the id of the video
            ('@videoPlayer',
             find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')),
            ('linkBaseURL', find_param('linkBaseURL')),
        )
        for key, value in optional_params:
            if value is not None:
                params[key] = value

        return cls._FEDERATED_URL_TEMPLATE % compat_urllib_parse.urlencode(params)

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Try to extract the brightcove url from the webpage, returns None
        if it can't be found
        """
        urls = cls._extract_brightcove_urls(webpage)
        return urls[0] if urls else None

    @classmethod
    def _extract_brightcove_urls(cls, webpage):
        """Return a list of all Brightcove URLs from the webpage """

        url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
        if url_m:
            return [url_m.group(1)]

        matches = re.findall(
            r'''(?sx)<object
            (?:
                [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?</object>''',
            webpage)
        return [cls._build_brighcove_url(m) for m in matches]

    def _real_extract(self, url):
        """Extract a single video or a playlist from a Brightcove URL.

        The url may carry smuggled data; a smuggled 'Referer' entry is
        used as the Referer header of the video request, defaulting to the
        (normalized) url itself.

        Returns the result of _get_video_info when the query string
        contains a video id, otherwise the result of _get_playlist_info
        for the player key.
        Raises ExtractorError if the query string contains neither a video
        id nor a player key.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Change the 'videoId' and others field to '@videoPlayer'
        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
        # Change bckey (used by bcove.me urls) to playerKey
        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
        mobj = re.match(self._VALID_URL, url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # We set the original url as the default 'Referer' header
            referer = smuggled_data.get('Referer', url)
            return self._get_video_info(
                videoPlayer[0], query_str, query, referer=referer)

        # Without a video id the player key is the only way to find the
        # content; report its absence as an extraction error instead of
        # letting a bare KeyError escape.
        player_key = query.get('playerKey')
        if not player_key:
            raise ExtractorError('Cannot find video ID or player key in the URL')
        return self._get_playlist_info(player_key[0])

    def _get_video_info(self, video_id, query_str, query, referer=None):
        """Download the federated player page and extract the video from it.

        query_str is the raw query string appended to the federated URL and
        query is its parsed form.  A 'linkBaseURL' entry in query overrides
        the referer argument; whichever value results is sent as the
        Referer header when it is not None.

        Returns the result of _extract_video_info for the media description
        embedded in the page's experienceJSON variable.
        """
        req = compat_urllib_request.Request(
            self._FEDERATED_URL_TEMPLATE % query_str)
        link_base = query.get('linkBaseURL')
        effective_referer = referer if link_base is None else link_base[0]
        if effective_referer is not None:
            req.add_header('Referer', effective_referer)
        webpage = self._download_webpage(req, video_id)

        self.report_extraction(video_id)
        experience_json = self._search_regex(
            r'var experienceJSON = ({.*?});', webpage, 'json')
        experience = json.loads(experience_json)['data']
        media = experience['programmedContent']['videoPlayer']['mediaDTO']
        # Hand the ad server URL over to _extract_video_info through the
        # media description itself
        media['_youtubedl_adServerURL'] = experience.get('adServerURL')

        return self._extract_video_info(media)

    def _get_playlist_info(self, player_key):
        """Download the programming of a player and return it as a playlist.

        Returns the result of playlist_result built from every entry of
        the video list's media collection.
        Raises ExtractorError if the response has no 'videoList' entry.
        """
        info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
        raw_response = self._download_webpage(
            info_url, player_key, 'Downloading playlist information')

        json_data = json.loads(raw_response)
        if 'videoList' not in json_data:
            raise ExtractorError('Empty playlist')

        video_list = json_data['videoList']
        collection = video_list['mediaCollectionDTO']
        entries = [
            self._extract_video_info(video_dto)
            for video_dto in collection['videoDTOs']
        ]

        return self.playlist_result(
            entries,
            playlist_id=video_list['id'],
            playlist_title=collection['displayName'])

    def _extract_video_info(self, video_info):
        """Build the result dictionary for one Brightcove media description.

        video_info is a dict; 'id' and 'displayName' are required, while
        'shortDescription', 'videoStillURL', 'thumbnailURL',
        'publisherName', 'renditions', 'FLVFullLengthURL' and
        '_youtubedl_adServerURL' are read when present.

        Returns the info dict of the video.  When the 'include_ads'
        downloader parameter is set and an ad server URL is known, the
        result is instead a playlist made of the ad followed by the video,
        or only the ad entry if the video has no media URL.
        Raises ExtractorError if the video has no media URL and no ad is
        returned.
        """
        info = {
            'id': compat_str(video_info['id']),
            'title': video_info['displayName'].strip(),
            'description': video_info.get('shortDescription'),
            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
            'uploader': video_info.get('publisherName'),
        }

        renditions = video_info.get('renditions')
        if renditions:
            # Formats are listed by ascending 'size'
            # NOTE(review): presumably so that the best rendition comes
            # last - confirm against the downloader's format selection.
            renditions = sorted(renditions, key=lambda r: r['size'])
            info['formats'] = [{
                'url': rend['defaultURL'],
                'height': rend.get('frameHeight'),
                'width': rend.get('frameWidth'),
            } for rend in renditions]
        elif video_info.get('FLVFullLengthURL') is not None:
            info['url'] = video_info['FLVFullLengthURL']

        # The media may be exposed either as a single 'url' or as a list of
        # 'formats'; both count as a downloadable video.
        has_media = 'url' in info or bool(info.get('formats'))

        if self._downloader.params.get('include_ads', False):
            ad_server_url = video_info.get('_youtubedl_adServerURL')
            if ad_server_url:
                ad_info = {
                    '_type': 'url',
                    'url': ad_server_url,
                }
                if not has_media:
                    return ad_info
                # Keep the video next to its ad; checking only for 'url'
                # here would drop videos that are described by 'formats'.
                return {
                    '_type': 'playlist',
                    'title': info['title'],
                    'entries': [ad_info, info],
                }

        if not has_media:
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info
Commit	Line	Data
592882aa	1	# encoding: utf-8
400e5810	2	from __future__ import unicode_literals
592882aa	3
fbaaad49 JMF	4	import re
fbaaad49 JMF	5	import json
cfe50f04	6	import xml.etree.ElementTree
fbaaad49 JMF	7
fbaaad49 JMF	8	from .common import InfoExtractor
cfe50f04 JMF	9	from ..utils import (
cfe50f04 JMF	10	compat_urllib_parse,
45ff2d51	11	find_xpath_attr,
7b0817e8	12	fix_xml_ampersands,
6543f0dc	13	compat_urlparse,
eeb165e6	14	compat_str,
dd5bcdc4	15	compat_urllib_request,
7b0817e8	16	compat_parse_qs,
592882aa JMF	17
592882aa JMF	18	ExtractorError,
79f82953	19	unsmuggle_url,
cfe50f04	20	)
fbaaad49	21
dd5bcdc4	22
fbaaad49	23	class BrightcoveIE(InfoExtractor):
abb285fb	24	_VALID_URL = r'https?://.brightcove\.com/(services\|viewer).\?(?P<query>.*)'
cfe50f04	25	_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
592882aa JMF	26
	27	_TESTS = [
	28	{
4de1994b	29	# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
400e5810 PH	30	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
	31	'file': '2371591881001.mp4',
	32	'md5': '5423e113865d26e40624dce2e4b45d95',
	33	'note': 'Test Brightcove downloads and detection in GenericIE',
	34	'info_dict': {
	35	'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
	36	'uploader': '8TV',
	37	'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
592882aa JMF	38	}
	39	},
	40	{
4de1994b	41	# From http://medianetwork.oracle.com/video/player/1785452137001
400e5810 PH	42	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
	43	'file': '1785452137001.flv',
	44	'info_dict': {
	45	'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
	46	'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
	47	'uploader': 'Oracle',
592882aa JMF	48	},
592882aa JMF	49	},
fc4a0c2a JMF	50	{
fc4a0c2a JMF	51	# From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
400e5810 PH	52	'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
	53	'info_dict': {
	54	'id': '2750934548001',
	55	'ext': 'mp4',
	56	'title': 'This Bracelet Acts as a Personal Thermostat',
	57	'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
	58	'uploader': 'Mashable',
fc4a0c2a JMF	59	},
fc4a0c2a JMF	60	},
77526143 JMF	61	{
	62	# test that the default referer works
	63	# from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
400e5810 PH	64	'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
	65	'info_dict': {
	66	'id': '2878862109001',
	67	'ext': 'mp4',
	68	'title': 'Lost in Motion II',
	69	'description': 'md5:363109c02998fee92ec02211bd8000df',
	70	'uploader': 'National Ballet of Canada',
77526143	71	},
117bec93	72	}
592882aa	73	]
cfe50f04 JMF	74
	75	@classmethod
	76	def _build_brighcove_url(cls, object_str):
	77	"""
	78	Build a Brightcove url from a xml string containing
	79	<object class="BrightcoveExperience">{params}</object>
	80	"""
46e28a84 PH	81
	82	# Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
	83	object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
	84	lambda m: m.group(1) + '/>', object_str)
2d0efe70	85	# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
400e5810	86	object_str = object_str.replace('<--', '<!--')
7b0817e8	87	object_str = fix_xml_ampersands(object_str)
46e28a84	88
cfe50f04	89	object_doc = xml.etree.ElementTree.fromstring(object_str)
7b0817e8 PH	90
7b0817e8 PH	91	fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
47917f24 JMF	92	if fv_el is not None:
	93	flashvars = dict(
	94	(k, v[0])
	95	for k, v in compat_parse_qs(fv_el.attrib['value']).items())
	96	else:
	97	flashvars = {}
7b0817e8	98
36de0a0e	99	def find_param(name):
7b0817e8 PH	100	if name in flashvars:
7b0817e8 PH	101	return flashvars[name]
d214fdb8 JMF	102	node = find_xpath_attr(object_doc, './param', 'name', name)
	103	if node is not None:
	104	return node.attrib['value']
	105	return None
7b0817e8 PH	106
	107	params = {}
	108
	109	playerID = find_param('playerID')
	110	if playerID is None:
	111	raise ExtractorError('Cannot find player ID')
	112	params['playerID'] = playerID
	113
36de0a0e	114	playerKey = find_param('playerKey')
cfe50f04 JMF	115	# Not all pages define this value
cfe50f04 JMF	116	if playerKey is not None:
d214fdb8	117	params['playerKey'] = playerKey
36de0a0e JMF	118	# The three fields hold the id of the video
36de0a0e JMF	119	videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
abb285fb	120	if videoPlayer is not None:
d214fdb8	121	params['@videoPlayer'] = videoPlayer
36de0a0e	122	linkBase = find_param('linkBaseURL')
dd5bcdc4	123	if linkBase is not None:
d214fdb8	124	params['linkBaseURL'] = linkBase
cfe50f04 JMF	125	data = compat_urllib_parse.urlencode(params)
cfe50f04 JMF	126	return cls._FEDERATED_URL_TEMPLATE % data
fbaaad49	127
eeb165e6 JMF	128	@classmethod
eeb165e6 JMF	129	def _extract_brightcove_url(cls, webpage):
99877772	130	"""Try to extract the brightcove url from the webpage, returns None
eeb165e6 JMF	131	if it can't be found
eeb165e6 JMF	132	"""
99877772 PH	133	urls = cls._extract_brightcove_urls(webpage)
	134	return urls[0] if urls else None
	135
	136	@classmethod
	137	def _extract_brightcove_urls(cls, webpage):
	138	"""Return a list of all Brightcove URLs from the webpage """
117bec93 PH	139
	140	url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
	141	if url_m:
99877772	142	return [url_m.group(1)]
117bec93	143
99877772	144	matches = re.findall(
7b0817e8 PH	145	r'''(?sx)<object
7b0817e8 PH	146	(?:
99877772	147	[^>]+?class=[\'"][^>]?BrightcoveExperience.?[\'"] \|
7b0817e8 PH	148	[^>]?>\s<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
	149	).+?</object>''',
	150	webpage)
99877772	151	return [cls._build_brighcove_url(m) for m in matches]
eeb165e6	152
fbaaad49	153	def _real_extract(self, url):
79f82953 PH	154	url, smuggled_data = unsmuggle_url(url, {})
79f82953 PH	155
51040b72 JMF	156	# Change the 'videoId' and others field to '@videoPlayer'
	157	url = re.sub(r'(?<=[?&])(videoI(d\|D)\|bctid)', '%40videoPlayer', url)
	158	# Change bckey (used by bcove.me urls) to playerKey
	159	url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
fbaaad49	160	mobj = re.match(self._VALID_URL, url)
6543f0dc JMF	161	query_str = mobj.group('query')
6543f0dc JMF	162	query = compat_urlparse.parse_qs(query_str)
fbaaad49	163
6543f0dc JMF	164	videoPlayer = query.get('@videoPlayer')
6543f0dc JMF	165	if videoPlayer:
79f82953 PH	166	# We set the original url as the default 'Referer' header
	167	referer = smuggled_data.get('Referer', url)
	168	return self._get_video_info(
	169	videoPlayer[0], query_str, query, referer=referer)
abb285fb	170	else:
6543f0dc JMF	171	player_key = query['playerKey']
6543f0dc JMF	172	return self._get_playlist_info(player_key[0])
abb285fb	173
77526143	174	def _get_video_info(self, video_id, query_str, query, referer=None):
dd5bcdc4 JMF	175	request_url = self._FEDERATED_URL_TEMPLATE % query_str
	176	req = compat_urllib_request.Request(request_url)
	177	linkBase = query.get('linkBaseURL')
	178	if linkBase is not None:
77526143 JMF	179	referer = linkBase[0]
	180	if referer is not None:
	181	req.add_header('Referer', referer)
dd5bcdc4	182	webpage = self._download_webpage(req, video_id)
fbaaad49 JMF	183
	184	self.report_extraction(video_id)
	185	info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
	186	info = json.loads(info)['data']
	187	video_info = info['programmedContent']['videoPlayer']['mediaDTO']
7b0817e8	188	video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
abb285fb JMF	189
	190	return self._extract_video_info(video_info)
	191
	192	def _get_playlist_info(self, player_key):
117bec93 PH	193	info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
	194	playlist_info = self._download_webpage(
	195	info_url, player_key, 'Downloading playlist information')
abb285fb	196
59145479 PH	197	json_data = json.loads(playlist_info)
59145479 PH	198	if 'videoList' not in json_data:
400e5810	199	raise ExtractorError('Empty playlist')
59145479	200	playlist_info = json_data['videoList']
abb285fb JMF	201	videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
	202
	203	return self.playlist_result(videos, playlist_id=playlist_info['id'],
	204	playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
	205
	206	def _extract_video_info(self, video_info):
592882aa	207	info = {
eeb165e6	208	'id': compat_str(video_info['id']),
066f6a06	209	'title': video_info['displayName'].strip(),
592882aa JMF	210	'description': video_info.get('shortDescription'),
	211	'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
	212	'uploader': video_info.get('publisherName'),
	213	}
abb285fb	214
592882aa JMF	215	renditions = video_info.get('renditions')
	216	if renditions:
	217	renditions = sorted(renditions, key=lambda r: r['size'])
b0759f0c JMF	218	info['formats'] = [{
	219	'url': rend['defaultURL'],
	220	'height': rend.get('frameHeight'),
	221	'width': rend.get('frameWidth'),
	222	} for rend in renditions]
592882aa JMF	223	elif video_info.get('FLVFullLengthURL') is not None:
	224	info.update({
	225	'url': video_info['FLVFullLengthURL'],
592882aa	226	})
7b0817e8 PH	227
	228	if self._downloader.params.get('include_ads', False):
	229	adServerURL = video_info.get('_youtubedl_adServerURL')
	230	if adServerURL:
	231	ad_info = {
	232	'_type': 'url',
	233	'url': adServerURL,
	234	}
	235	if 'url' in info:
	236	return {
	237	'_type': 'playlist',
	238	'title': info['title'],
	239	'entries': [ad_info, info],
	240	}
	241	else:
	242	return ad_info
	243
d614aa40	244	if 'url' not in info and not info.get('formats'):
400e5810	245	raise ExtractorError('Unable to extract video url for %s' % info['id'])
592882aa	246	return info