[yt-dlp.git] / youtube_dl / extractor / brightcove.py

# encoding: utf-8
from __future__ import unicode_literals

import re
import json
import xml.etree.ElementTree

from .common import InfoExtractor
from ..compat import (
    compat_parse_qs,
    compat_str,
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..utils import (
    determine_ext,
    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
    unescapeHTML,
    unsmuggle_url,
    js_to_json,
    int_or_none,
    parse_iso8601,
    extract_attributes,
)


class BrightcoveIE(InfoExtractor):
    """Extractor for Brightcove federated player URLs.

    Accepts ``brightcove:<query>`` pseudo-URLs as well as brightcove.com
    ``services``/``viewer`` URLs. Extraction is driven entirely by the query
    string: an ``@videoPlayer`` value selects a single video, otherwise a
    ``playerKey`` value selects a playlist.
    """
    # The query string (the part after '?', or after the 'brightcove:'
    # prefix) is captured as the 'query' group.
    _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
    # Endpoint that a query string is appended to when building or requesting
    # a player URL.
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'

    _TESTS = [
        {
            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
            'md5': '5423e113865d26e40624dce2e4b45d95',
            'note': 'Test Brightcove downloads and detection in GenericIE',
            'info_dict': {
                'id': '2371591881001',
                'ext': 'mp4',
                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                'uploader': '8TV',
                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
            }
        },
        {
            # From http://medianetwork.oracle.com/video/player/1785452137001
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
            'info_dict': {
                'id': '1785452137001',
                'ext': 'flv',
                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                'uploader': 'Oracle',
            },
        },
        {
            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
            'info_dict': {
                'id': '2750934548001',
                'ext': 'mp4',
                'title': 'This Bracelet Acts as a Personal Thermostat',
                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
                'uploader': 'Mashable',
            },
        },
        {
            # test that the default referer works
            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
            'info_dict': {
                'id': '2878862109001',
                'ext': 'mp4',
                'title': 'Lost in Motion II',
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
        },
        {
            # test flv videos served by akamaihd.net
            # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
            # The md5 checksum changes on each download
            'info_dict': {
                'id': '2996102916001',
                'ext': 'flv',
                'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
                'uploader': 'Red Bull TV',
                'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
            },
        },
        {
            # playlist test
            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
            'info_dict': {
                'title': 'Sealife',
                'id': '3550319591001',
            },
            'playlist_mincount': 7,
        },
    ]

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Build a Brightcove url from the markup of an embed of the form
        <object class="BrightcoveExperience">{params}</object>

        The markup is patched up so that it parses as XML. Each parameter is
        then looked up in the query-string encoded ``flashVars`` param first
        and in the individual <param> elements second.

        Returns the federated url, or None when the markup cannot be parsed
        even after the fix-ups. Raises ExtractorError when no playerID is
        present.
        """

        # Self-close <param ...> tags that were left open,
        # see https://github.com/rg3/youtube-dl/issues/1553
        markup = re.sub(
            r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
            lambda tag: tag.group(1) + '/>', object_str)
        # Repair malformed comment openers,
        # see https://github.com/rg3/youtube-dl/issues/1608
        markup = markup.replace('<--', '<!--')
        # Strip the xmlns attribute so the xpath lookups below need no
        # namespace prefix
        markup = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', markup)
        markup = fix_xml_ampersands(markup)

        try:
            root = xml.etree.ElementTree.fromstring(markup.encode('utf-8'))
        except compat_xml_parse_error:
            return None

        def param_node(name):
            return find_xpath_attr(root, './param', 'name', name)

        # flashVars is itself a query string; keep the first value per key
        flashvars = {}
        flashvars_node = param_node('flashVars')
        if flashvars_node is not None:
            parsed = compat_parse_qs(flashvars_node.attrib['value'])
            for key, values in parsed.items():
                flashvars[key] = values[0]

        def find_param(name):
            if name in flashvars:
                return flashvars[name]
            node = param_node(name)
            return None if node is None else node.attrib['value']

        player_id = find_param('playerID')
        if player_id is None:
            raise ExtractorError('Cannot find player ID')
        params = {'playerID': player_id}

        # Not all pages define a player key
        player_key = find_param('playerKey')
        if player_key is not None:
            params['playerKey'] = player_key

        # Any of these three names may hold the id of the video; take the
        # first one that yields a non-empty value
        video_ref = None
        for candidate in ('@videoPlayer', 'videoId', 'videoID'):
            video_ref = find_param(candidate)
            if video_ref:
                break
        if video_ref is not None:
            params['@videoPlayer'] = video_ref

        link_base = find_param('linkBaseURL')
        if link_base is not None:
            params['linkBaseURL'] = link_base

        return cls._make_brightcove_url(params)

    @classmethod
    def _build_brighcove_url_from_js(cls, object_js):
        """
        Build a Brightcove url from a ``customBC.createVideo(...)`` JS call.

        object_js is the text of the call. Returns the federated url built
        from the playerID, playerKey and videoID arguments, or None when the
        call does not have the expected shape.
        """
        # The layout of JS is as follows:
        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
        #   // build Brightcove <object /> XML
        # }
        #
        # The dot is escaped as a literal '.'; the previous spelling
        # 'customBC.\createVideo' contained the unknown escape '\c', which
        # the re module rejects with re.error on Python 3.6+.
        m = re.search(
            r'''(?x)customBC\.createVideo\(
                .*?                                                  # skipping width and height
                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
                                                                     # in length, however it's appended to itself
                                                                     # in places, so truncate
                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
            ''', object_js)
        if m is None:
            return None
        return cls._make_brightcove_url(m.groupdict())

    @classmethod
    def _make_brightcove_url(cls, params):
        """Return the federated player url with params urlencoded as its query string."""
        return cls._FEDERATED_URL_TEMPLATE % compat_urllib_parse.urlencode(params)

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Return the first Brightcove url found in the webpage, or None
        if the page does not contain any
        """
        candidates = cls._extract_brightcove_urls(webpage)
        if not candidates:
            return None
        return candidates[0]

    @classmethod
    def _extract_brightcove_urls(cls, webpage):
        """Return a list of all Brightcove URLs from the webpage

        Three sources are tried in order and the first one that applies wins:
        an og:video <meta> tag, <object> embeds, and customBC.createVideo(...)
        JS calls.
        """

        og_video = re.search(
            r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
            webpage)
        if og_video is not None:
            og_url = unescapeHTML(og_video.group(1))
            # Some sites leave both parameters out and such a url cannot be
            # downloaded, for example:
            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
            if any(marker in og_url for marker in ('playerKey', 'videoId')):
                return [og_url]

        object_embeds = re.findall(
            r'''(?sx)<object
            (?:
                [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?>\s*</object>''',
            webpage)
        if object_embeds:
            built = [cls._build_brighcove_url(embed) for embed in object_embeds]
            # Embeds that could not be turned into a url are dropped
            return [bc_url for bc_url in built if bc_url]

        js_calls = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
        built = [cls._build_brighcove_url_from_js(call) for call in js_calls]
        return [bc_url for bc_url in built if bc_url]

    def _real_extract(self, url):
        """Extract a single video when the query names one via @videoPlayer,
        otherwise the playlist identified by playerKey.

        Raises ExtractorError when the query carries neither parameter.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Normalise alternative parameter names before parsing the query:
        # 'videoId', 'videoID' and 'bctid' become '@videoPlayer', and
        # 'bckey' (used by bcove.me urls) becomes 'playerKey'
        for pattern, replacement in (
                (r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer'),
                (r'(?<=[?&])bckey', 'playerKey')):
            url = re.sub(pattern, replacement, url)

        query_str = re.match(self._VALID_URL, url).group('query')
        query = compat_urlparse.parse_qs(query_str)

        video_refs = query.get('@videoPlayer')
        if video_refs:
            # The (rewritten) url itself is the default 'Referer' header
            # unless one was smuggled in
            return self._get_video_info(
                video_refs[0], query_str, query,
                referer=smuggled_data.get('Referer', url))

        if 'playerKey' not in query:
            raise ExtractorError(
                'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
                expected=True)
        return self._get_playlist_info(query['playerKey'][0])

    def _get_video_info(self, video_id, query_str, query, referer=None):
        """Download the federated player page for query_str and extract the
        video described by its embedded experienceJSON.

        A linkBaseURL entry in query takes precedence over referer for the
        'Referer' header. Raises ExtractorError when the page is a
        "We're sorry." error page.
        """
        req = compat_urllib_request.Request(
            self._FEDERATED_URL_TEMPLATE % query_str)
        link_base = query.get('linkBaseURL')
        effective_referer = referer if link_base is None else link_base[0]
        if effective_referer is not None:
            req.add_header('Referer', effective_referer)
        webpage = self._download_webpage(req, video_id)

        error_msg = self._html_search_regex(
            r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
            'error message', default=None)
        if error_msg is not None:
            raise ExtractorError(
                'brightcove said: %s' % error_msg, expected=True)

        self.report_extraction(video_id)
        experience_json = self._search_regex(
            r'var experienceJSON = ({.*});', webpage, 'json')
        experience = json.loads(experience_json)['data']
        media = experience['programmedContent']['videoPlayer']['mediaDTO']
        # Stash the ad server url on the media dict so that
        # _extract_video_info can read it back
        media['_youtubedl_adServerURL'] = experience.get('adServerURL')

        return self._extract_video_info(media)

    def _get_playlist_info(self, player_key):
        """Download the programming of the player identified by player_key
        and return it as a playlist result.

        Raises ExtractorError when the response has no 'videoList' entry.
        """
        info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
        raw_json = self._download_webpage(
            info_url, player_key, 'Downloading playlist information')

        json_data = json.loads(raw_json)
        if 'videoList' not in json_data:
            raise ExtractorError('Empty playlist')

        video_list = json_data['videoList']
        collection = video_list['mediaCollectionDTO']
        entries = [
            self._extract_video_info(video_dto)
            for video_dto in collection['videoDTOs']]

        return self.playlist_result(
            entries,
            playlist_id='%s' % video_list['id'],
            playlist_title=collection['displayName'])

    def _extract_video_info(self, video_info):
        """Turn a Brightcove media dict into an info dict.

        video_info must provide 'id' and 'displayName'. Formats are built
        from its 'renditions' list when that is non-empty, otherwise
        'FLVFullLengthURL' is used as the single url.

        When the 'include_ads' option is set and video_info carries a
        '_youtubedl_adServerURL', the result is a playlist of the ad followed
        by the video, or just the ad entry when the video itself has nothing
        downloadable.

        Raises ExtractorError when neither a url nor any format was found
        (and no ad entry was returned instead).
        """
        info = {
            'id': compat_str(video_info['id']),
            'title': video_info['displayName'].strip(),
            'description': video_info.get('shortDescription'),
            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
            'uploader': video_info.get('publisherName'),
        }

        renditions = video_info.get('renditions')
        if renditions:
            formats = []
            for rend in renditions:
                # .get() so that a rendition lacking a key is skipped or
                # treated as non-remote instead of raising KeyError
                url = rend.get('defaultURL')
                if not url:
                    continue
                ext = None
                if rend.get('remote'):
                    url_comp = compat_urllib_parse_urlparse(url)
                    if url_comp.path.endswith('.m3u8'):
                        formats.extend(
                            self._extract_m3u8_formats(url, info['id'], 'mp4'))
                        continue
                    elif 'akamaihd.net' in url_comp.netloc:
                        # This type of renditions are served through
                        # akamaihd.net, but they don't use f4m manifests
                        url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
                        ext = 'flv'
                if ext is None:
                    ext = determine_ext(url)
                size = rend.get('size')
                formats.append({
                    'url': url,
                    'ext': ext,
                    'height': rend.get('frameHeight'),
                    'width': rend.get('frameWidth'),
                    # A reported size of 0 means the size is unknown
                    'filesize': size if size != 0 else None,
                })
            self._sort_formats(formats)
            info['formats'] = formats
        elif video_info.get('FLVFullLengthURL') is not None:
            info['url'] = video_info['FLVFullLengthURL']

        if self._downloader.params.get('include_ads', False):
            adServerURL = video_info.get('_youtubedl_adServerURL')
            if adServerURL:
                ad_info = {
                    '_type': 'url',
                    'url': adServerURL,
                }
                # The video is downloadable through either a direct 'url' or
                # a 'formats' list. Checking only 'url' dropped every video
                # that was extracted from renditions and returned the ad
                # alone.
                if 'url' in info or info.get('formats'):
                    return {
                        '_type': 'playlist',
                        'title': info['title'],
                        'entries': [ad_info, info],
                    }
                return ad_info

        if 'url' not in info and not info.get('formats'):
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info


class BrightcoveInPageEmbedIE(InfoExtractor):
    """Extractor for players.brightcove.net in-page embed player URLs.

    The policy key is read from the player's index.min.js and then used to
    query the Brightcove playback API for the video's metadata and sources.
    """
    # The unnamed groups are the player id and the embed name; _real_extract
    # unpacks all four groups positionally, so their order matters.
    # '.' and '?' in 'index.html?' are escaped so that they match literally.
    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index\.html\?.*videoId=(?P<video_id>\d+)'
    # NOTE(review): the other extractor in this file declares its cases as
    # `_TESTS`; this attribute is presumably meant to be `_TEST` and may not
    # be picked up under its current name. The expected 'ext', 'description'
    # and 'duration' values should be verified before renaming it.
    TEST = {
        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
        'info_dict': {
            'id': '4463358922001',
            'ext': 'flv',
            'title': 'Meet the man behind Popcorn Time',
            'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
            'duration': 165768,
        }
    }

    @staticmethod
    def _extract_url(webpage):
        """Return the player url for the first <video> embed in webpage.

        The url is built from the embed's data-account, data-player,
        data-embed and data-video-id attributes. Returns None when there is
        no <video> element or any of the four values is missing.
        """
        video_attributes = re.search(r'(?s)<video([^>]*)>.*?</(?:video|audio)>', webpage)
        if video_attributes:
            # The whole matched element is scanned for data-* attributes;
            # the result is used as a mapping keyed by the name without the
            # 'data-' prefix.
            video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']')
            account_id = video_attributes.get('account')
            player_id = video_attributes.get('player')
            embed = video_attributes.get('embed')
            video_id = video_attributes.get('video-id')
            if account_id and player_id and embed and video_id:
                return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id)
        return None

    def _real_extract(self, url):
        """Extract metadata and formats for the video named in url."""
        mobj = re.match(self._VALID_URL, url)
        account_id, player_id, embed, video_id = mobj.groups()

        webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id)

        # The player script passes an object literal to catalog(...); it
        # holds the policy key needed to authorise the playback API request.
        catalog = self._parse_json(
            js_to_json(
                self._search_regex(
                    r'catalog\(({[^}]+})\);',
                    webpage,
                    'catalog'
                )
            ),
            video_id
        )
        policy_key = catalog['policyKey']

        req = compat_urllib_request.Request(
            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id),
            headers={'Accept': 'application/json;pk=%s' % policy_key})
        json_data = self._download_json(req, video_id)

        title = json_data['name']
        description = json_data.get('description')
        # Previously read from 'name', which put the title into the
        # thumbnail field.
        thumbnail = json_data.get('thumbnail')
        timestamp = parse_iso8601(json_data.get('published_at'))
        # NOTE(review): the expected duration in TEST (165768) looks like
        # milliseconds - confirm the unit the API reports and scale if needed.
        duration = int_or_none(json_data.get('duration'))

        formats = []
        # 'or []' keeps a response without sources from raising TypeError;
        # _sort_formats below then reports the missing formats.
        for source in json_data.get('sources') or []:
            src = source.get('src')
            if source.get('type') == 'application/x-mpegURL':
                if not src:
                    # An HLS entry without a manifest url cannot be used
                    continue
                formats.extend(self._extract_m3u8_formats(src, video_id))
            else:
                src = src or source.get('streaming_src')
                if not src:
                    continue
                formats.append({
                    'url': src,
                    'abr': source.get('avg_bitrate'),
                    'width': int_or_none(source.get('width')),
                    'height': int_or_none(source.get('height')),
                    'filesize': source.get('size'),
                    'container': source.get('container'),
                    # Previously read from 'container', which duplicated the
                    # container name as the video codec.
                    'vcodec': source.get('codec'),
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
        }
Commit	Line	Data
592882aa	1	# encoding: utf-8
400e5810	2	from __future__ import unicode_literals
592882aa	3
fbaaad49 JMF	4	import re
fbaaad49 JMF	5	import json
cfe50f04	6	import xml.etree.ElementTree
fbaaad49 JMF	7
fbaaad49 JMF	8	from .common import InfoExtractor
1cc79574	9	from ..compat import (
7b0817e8	10	compat_parse_qs,
1cc79574 PH	11	compat_str,
1cc79574 PH	12	compat_urllib_parse,
37aab278	13	compat_urllib_parse_urlparse,
1cc79574 PH	14	compat_urllib_request,
1cc79574 PH	15	compat_urlparse,
e20d0c1e	16	compat_xml_parse_error,
1cc79574 PH	17	)
1cc79574 PH	18	from ..utils import (
5524b242	19	determine_ext,
592882aa	20	ExtractorError,
1cc79574 PH	21	find_xpath_attr,
1cc79574 PH	22	fix_xml_ampersands,
ac6e4ca1	23	unescapeHTML,
1cc79574	24	unsmuggle_url,
ed126900	25	js_to_json,
	26	int_or_none,
	27	parse_iso8601,
	28	extract_attributes,
cfe50f04	29	)
fbaaad49	30
dd5bcdc4	31
fbaaad49	32	class BrightcoveIE(InfoExtractor):
ec05fee4	33	_VALID_URL = r'(?:https?://.brightcove\.com/(services\|viewer).?\?\|brightcove:)(?P<query>.*)'
cfe50f04	34	_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
592882aa JMF	35
	36	_TESTS = [
	37	{
4de1994b	38	# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
400e5810	39	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
400e5810 PH	40	'md5': '5423e113865d26e40624dce2e4b45d95',
	41	'note': 'Test Brightcove downloads and detection in GenericIE',
	42	'info_dict': {
96bef88f JMF	43	'id': '2371591881001',
96bef88f JMF	44	'ext': 'mp4',
400e5810 PH	45	'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
	46	'uploader': '8TV',
	47	'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
592882aa JMF	48	}
	49	},
	50	{
4de1994b	51	# From http://medianetwork.oracle.com/video/player/1785452137001
400e5810	52	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
400e5810	53	'info_dict': {
96bef88f JMF	54	'id': '1785452137001',
96bef88f JMF	55	'ext': 'flv',
400e5810 PH	56	'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
	57	'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
	58	'uploader': 'Oracle',
592882aa JMF	59	},
592882aa JMF	60	},
fc4a0c2a JMF	61	{
fc4a0c2a JMF	62	# From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
400e5810 PH	63	'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
	64	'info_dict': {
	65	'id': '2750934548001',
	66	'ext': 'mp4',
	67	'title': 'This Bracelet Acts as a Personal Thermostat',
	68	'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
	69	'uploader': 'Mashable',
fc4a0c2a JMF	70	},
fc4a0c2a JMF	71	},
77526143 JMF	72	{
	73	# test that the default referer works
	74	# from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
400e5810 PH	75	'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
	76	'info_dict': {
	77	'id': '2878862109001',
	78	'ext': 'mp4',
	79	'title': 'Lost in Motion II',
	80	'description': 'md5:363109c02998fee92ec02211bd8000df',
	81	'uploader': 'National Ballet of Canada',
77526143	82	},
5524b242 JMF	83	},
	84	{
	85	# test flv videos served by akamaihd.net
	86	# From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
	87	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
	88	# The md5 checksum changes on each download
	89	'info_dict': {
	90	'id': '2996102916001',
	91	'ext': 'flv',
	92	'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
	93	'uploader': 'Red Bull TV',
	94	'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
	95	},
	96	},
bd4e40df JMF	97	{
	98	# playlist test
	99	# from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
	100	'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
	101	'info_dict': {
	102	'title': 'Sealife',
d47c26e1	103	'id': '3550319591001',
bd4e40df JMF	104	},
	105	'playlist_mincount': 7,
	106	},
592882aa	107	]
cfe50f04 JMF	108
	109	@classmethod
	110	def _build_brighcove_url(cls, object_str):
	111	"""
	112	Build a Brightcove url from a xml string containing
	113	<object class="BrightcoveExperience">{params}</object>
	114	"""
46e28a84 PH	115
46e28a84 PH	116	# Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
c1147c05	117	object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]"))>',
46e28a84	118	lambda m: m.group(1) + '/>', object_str)
2d0efe70	119	# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
400e5810	120	object_str = object_str.replace('<--', '<!--')
d6f714f3 JMF	121	# remove namespace to simplify extraction
d6f714f3 JMF	122	object_str = re.sub(r'(<object[^>])(xmlns=".?")', r'\1', object_str)
7b0817e8	123	object_str = fix_xml_ampersands(object_str)
46e28a84	124
94c12557 S	125	try:
94c12557 S	126	object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
e20d0c1e	127	except compat_xml_parse_error:
94c12557	128	return
7b0817e8 PH	129
7b0817e8 PH	130	fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
47917f24 JMF	131	if fv_el is not None:
	132	flashvars = dict(
	133	(k, v[0])
	134	for k, v in compat_parse_qs(fv_el.attrib['value']).items())
	135	else:
	136	flashvars = {}
7b0817e8	137
36de0a0e	138	def find_param(name):
7b0817e8 PH	139	if name in flashvars:
7b0817e8 PH	140	return flashvars[name]
d214fdb8 JMF	141	node = find_xpath_attr(object_doc, './param', 'name', name)
	142	if node is not None:
	143	return node.attrib['value']
	144	return None
7b0817e8 PH	145
	146	params = {}
	147
	148	playerID = find_param('playerID')
	149	if playerID is None:
	150	raise ExtractorError('Cannot find player ID')
	151	params['playerID'] = playerID
	152
36de0a0e	153	playerKey = find_param('playerKey')
cfe50f04 JMF	154	# Not all pages define this value
cfe50f04 JMF	155	if playerKey is not None:
d214fdb8	156	params['playerKey'] = playerKey
36de0a0e JMF	157	# The three fields hold the id of the video
36de0a0e JMF	158	videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
abb285fb	159	if videoPlayer is not None:
d214fdb8	160	params['@videoPlayer'] = videoPlayer
36de0a0e	161	linkBase = find_param('linkBaseURL')
dd5bcdc4	162	if linkBase is not None:
d214fdb8	163	params['linkBaseURL'] = linkBase
af9cdee9 S	164	return cls._make_brightcove_url(params)
	165
	166	@classmethod
	167	def _build_brighcove_url_from_js(cls, object_js):
	168	# The layout of JS is as follows:
	169	# customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
	170	# // build Brightcove <object /> XML
	171	# }
	172	m = re.search(
	173	r'''(?x)customBC.\createVideo\(
	174	.*? # skipping width and height
	175	["\'](?P<playerID>\d+)["\']\s,\s # playerID
	176	["\'](?P<playerKey>AQ[^"\']{48})[^"\']["\']\s,\s* # playerKey begins with AQ and is 50 characters
	177	# in length, however it's appended to itself
	178	# in places, so truncate
	179	["\'](?P<videoID>\d+)["\'] # @videoPlayer
	180	''', object_js)
	181	if m:
	182	return cls._make_brightcove_url(m.groupdict())
	183
	184	@classmethod
	185	def _make_brightcove_url(cls, params):
cfe50f04 JMF	186	data = compat_urllib_parse.urlencode(params)
cfe50f04 JMF	187	return cls._FEDERATED_URL_TEMPLATE % data
fbaaad49	188
eeb165e6 JMF	189	@classmethod
eeb165e6 JMF	190	def _extract_brightcove_url(cls, webpage):
99877772	191	"""Try to extract the brightcove url from the webpage, returns None
eeb165e6 JMF	192	if it can't be found
eeb165e6 JMF	193	"""
99877772 PH	194	urls = cls._extract_brightcove_urls(webpage)
	195	return urls[0] if urls else None
	196
	197	@classmethod
	198	def _extract_brightcove_urls(cls, webpage):
	199	"""Return a list of all Brightcove URLs from the webpage """
117bec93	200
e1ab5000	201	url_m = re.search(
b26733ba	202	r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure\|c)\.brightcove.com/[^\'"]+)[\'"]',
e1ab5000	203	webpage)
117bec93	204	if url_m:
381640e3 JMF	205	url = unescapeHTML(url_m.group(1))
	206	# Some sites don't add it, we can't download with this url, for example:
	207	# http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
e1ab5000	208	if 'playerKey' in url or 'videoId' in url:
381640e3	209	return [url]
117bec93	210
99877772	211	matches = re.findall(
7b0817e8 PH	212	r'''(?sx)<object
7b0817e8 PH	213	(?:
99877772	214	[^>]+?class=[\'"][^>]?BrightcoveExperience.?[\'"] \|
7b0817e8	215	[^>]?>\s<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
476e1095	216	).+?>\s*</object>''',
7b0817e8	217	webpage)
b4e1576a	218	if matches:
	219	return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
	220
af9cdee9 S	221	return list(filter(None, [
	222	cls._build_brighcove_url_from_js(custom_bc)
	223	for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
eeb165e6	224
fbaaad49	225	def _real_extract(self, url):
79f82953 PH	226	url, smuggled_data = unsmuggle_url(url, {})
79f82953 PH	227
51040b72 JMF	228	# Change the 'videoId' and others field to '@videoPlayer'
	229	url = re.sub(r'(?<=[?&])(videoI(d\|D)\|bctid)', '%40videoPlayer', url)
	230	# Change bckey (used by bcove.me urls) to playerKey
	231	url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
fbaaad49	232	mobj = re.match(self._VALID_URL, url)
6543f0dc JMF	233	query_str = mobj.group('query')
6543f0dc JMF	234	query = compat_urlparse.parse_qs(query_str)
fbaaad49	235
6543f0dc JMF	236	videoPlayer = query.get('@videoPlayer')
6543f0dc JMF	237	if videoPlayer:
79f82953 PH	238	# We set the original url as the default 'Referer' header
	239	referer = smuggled_data.get('Referer', url)
	240	return self._get_video_info(
	241	videoPlayer[0], query_str, query, referer=referer)
68575900	242	elif 'playerKey' in query:
6543f0dc JMF	243	player_key = query['playerKey']
6543f0dc JMF	244	return self._get_playlist_info(player_key[0])
68575900 PH	245	else:
	246	raise ExtractorError(
	247	'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
	248	expected=True)
abb285fb	249
77526143	250	def _get_video_info(self, video_id, query_str, query, referer=None):
dd5bcdc4 JMF	251	request_url = self._FEDERATED_URL_TEMPLATE % query_str
	252	req = compat_urllib_request.Request(request_url)
	253	linkBase = query.get('linkBaseURL')
	254	if linkBase is not None:
77526143 JMF	255	referer = linkBase[0]
	256	if referer is not None:
	257	req.add_header('Referer', referer)
dd5bcdc4	258	webpage = self._download_webpage(req, video_id)
fbaaad49	259
a5ed3e57	260	error_msg = self._html_search_regex(
4d46c1c6	261	r"<h1>We're sorry.</h1>([\s\n]<p>.?</p>)+", webpage,
a5ed3e57 PH	262	'error message', default=None)
	263	if error_msg is not None:
	264	raise ExtractorError(
	265	'brightcove said: %s' % error_msg, expected=True)
	266
fbaaad49	267	self.report_extraction(video_id)
a013eba6	268	info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json')
fbaaad49 JMF	269	info = json.loads(info)['data']
fbaaad49 JMF	270	video_info = info['programmedContent']['videoPlayer']['mediaDTO']
7b0817e8	271	video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
abb285fb JMF	272
	273	return self._extract_video_info(video_info)
	274
	275	def _get_playlist_info(self, player_key):
117bec93 PH	276	info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
	277	playlist_info = self._download_webpage(
	278	info_url, player_key, 'Downloading playlist information')
abb285fb	279
59145479 PH	280	json_data = json.loads(playlist_info)
59145479 PH	281	if 'videoList' not in json_data:
400e5810	282	raise ExtractorError('Empty playlist')
59145479	283	playlist_info = json_data['videoList']
abb285fb JMF	284	videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
abb285fb JMF	285
d47c26e1	286	return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
abb285fb JMF	287	playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
	288
	289	def _extract_video_info(self, video_info):
592882aa	290	info = {
eeb165e6	291	'id': compat_str(video_info['id']),
066f6a06	292	'title': video_info['displayName'].strip(),
592882aa JMF	293	'description': video_info.get('shortDescription'),
	294	'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
	295	'uploader': video_info.get('publisherName'),
	296	}
abb285fb	297
592882aa JMF	298	renditions = video_info.get('renditions')
592882aa JMF	299	if renditions:
5524b242 JMF	300	formats = []
	301	for rend in renditions:
	302	url = rend['defaultURL']
b5af6fcd NJ	303	if not url:
b5af6fcd NJ	304	continue
233d37fb	305	ext = None
5524b242	306	if rend['remote']:
37aab278 NJ	307	url_comp = compat_urllib_parse_urlparse(url)
	308	if url_comp.path.endswith('.m3u8'):
	309	formats.extend(
	310	self._extract_m3u8_formats(url, info['id'], 'mp4'))
	311	continue
	312	elif 'akamaihd.net' in url_comp.netloc:
	313	# This type of renditions are served through
	314	# akamaihd.net, but they don't use f4m manifests
	315	url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
	316	ext = 'flv'
233d37fb	317	if ext is None:
5524b242	318	ext = determine_ext(url)
35eacd0d	319	size = rend.get('size')
5524b242 JMF	320	formats.append({
	321	'url': url,
	322	'ext': ext,
	323	'height': rend.get('frameHeight'),
	324	'width': rend.get('frameWidth'),
35eacd0d	325	'filesize': size if size != 0 else None,
5524b242	326	})
35eacd0d	327	self._sort_formats(formats)
5524b242	328	info['formats'] = formats
592882aa JMF	329	elif video_info.get('FLVFullLengthURL') is not None:
	330	info.update({
	331	'url': video_info['FLVFullLengthURL'],
592882aa	332	})
7b0817e8 PH	333
	334	if self._downloader.params.get('include_ads', False):
	335	adServerURL = video_info.get('_youtubedl_adServerURL')
	336	if adServerURL:
	337	ad_info = {
	338	'_type': 'url',
	339	'url': adServerURL,
	340	}
	341	if 'url' in info:
	342	return {
	343	'_type': 'playlist',
	344	'title': info['title'],
	345	'entries': [ad_info, info],
	346	}
	347	else:
	348	return ad_info
	349
d614aa40	350	if 'url' not in info and not info.get('formats'):
400e5810	351	raise ExtractorError('Unable to extract video url for %s' % info['id'])
592882aa	352	return info
ed126900	353
	354
	355	class BrightcoveInPageEmbedIE(InfoExtractor):
	356	_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)'
	357	TEST = {
	358	'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
	359	'info_dict': {
	360	'id': '4463358922001',
	361	'ext': 'flv',
	362	'title': 'Meet the man behind Popcorn Time',
	363	'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
	364	'duration': 165768,
	365	}
	366	}
	367
	368	@staticmethod
	369	def _extract_url(webpage):
	370	video_attributes = re.search(r'(?s)<video([^>])>.?</(?:video\|audio)>', webpage)
	371	if video_attributes:
	372	video_attributes = extract_attributes(video_attributes.group(), r'(?s)\sdata-(account\|video-id\|playlist-id\|policy-key\|player\|embed)\s=\s*["\']([^"\']+)["\']')
	373	account_id = video_attributes.get('account')
	374	player_id = video_attributes.get('player')
	375	embed = video_attributes.get('embed')
	376	video_id = video_attributes.get('video-id')
	377	if account_id and player_id and embed and video_id:
	378	return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id)
	379	return None
	380
	381	def _real_extract(self, url):
	382	mobj = re.match(self._VALID_URL, url)
	383	account_id, player_id, embed, video_id = mobj.groups()
	384
	385	webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id)
	386
	387	catalog = self._parse_json(
	388	js_to_json(
	389	self._search_regex(
	390	r'catalog\(({[^}]+})\);',
	391	webpage,
	392	'catalog'
	393	)
	394	),
	395	video_id
	396	)
	397	policy_key = catalog['policyKey']
	398
	399	req = compat_urllib_request.Request(
	400	'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id),
	401	headers={'Accept': 'application/json;pk=%s' % policy_key})
	402	json_data = self._download_json(req, video_id)
	403
	404	title = json_data['name']
	405	description = json_data.get('description')
	406	thumbnail = json_data.get('name')
	407	timestamp = parse_iso8601(json_data.get('published_at'))
	408	duration = int_or_none(json_data.get('duration'))
	409
	410	formats = []
	411	for source in json_data.get('sources'):
	412	source_type = source.get('type')
	413	if source_type == 'application/x-mpegURL':
	414	formats.extend(self._extract_m3u8_formats(source.get('src'), video_id))
	415	else:
53407e3f	416	src = source.get('src') or source.get('streaming_src')
ed126900	417	if src:
	418	formats.append({
	419	'url': src,
	420	'abr': source.get('avg_bitrate'),
	421	'width': int_or_none(source.get('width')),
	422	'height': int_or_none(source.get('height')),
	423	'filesize': source.get('size'),
	424	'container': source.get('container'),
	425	'vcodec': source.get('container'),
	426	})
ed126900	427
	428	self._sort_formats(formats)
	429
	430	return {
	431	'id': video_id,
	432	'title': title,
	433	'description': description,
	434	'thumbnail': thumbnail,
	435	'timestamp': timestamp,
	436	'duration': duration,
	437	'formats': formats,
	438	}