[yt-dlp.git] / youtube_dl / extractor / internetvideoarchive.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    compat_urllib_parse,
    xpath_with_ns,
)


class InternetVideoArchiveIE(InfoExtractor):
    """Extractor for video.internetvideoarchive.net flash player URLs.

    The flash configuration XML points at a playlist URL; that URL is
    rewritten to its MRSS variant, whose ``media:content`` entries are
    turned into the list of formats.
    """

    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'

    _TEST = {
        'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
        'info_dict': {
            'id': '452693',
            'ext': 'mp4',
            'title': 'SKYFALL',
            'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
            'duration': 149,
        },
    }

    @staticmethod
    def _build_url(query):
        """Return the flash configuration URL carrying *query* as its query string."""
        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query

    @staticmethod
    def _clean_query(query):
        """Return a re-encoded query string restricted to the needed arguments.

        Only ``publishedid`` and ``customerid`` are kept from *query* (first
        value of each); ``playerid`` and ``videokbrate`` are then forced to
        fixed values.
        """
        NEEDED_ARGS = ['publishedid', 'customerid']
        query_dic = compat_urlparse.parse_qs(query)
        cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
        # Other player ids return m3u8 urls
        cleaned_dic['playerid'] = '247'
        cleaned_dic['videokbrate'] = '100000'
        return compat_urllib_parse.urlencode(cleaned_dic)

    def _real_extract(self, url):
        """Download the configuration and MRSS playlist for *url* and build the info dict.

        Returns a dict with ``id``, ``title``, ``formats``, ``thumbnail``,
        ``description`` and ``duration``.  The last three are ``None`` when
        the playlist item does not provide them.
        """
        query = compat_urlparse.urlparse(url).query
        query_dic = compat_urlparse.parse_qs(query)
        video_id = query_dic['publishedid'][0]
        url = self._build_url(query)

        flashconfiguration = self._download_xml(url, video_id,
                                                'Downloading flash configuration')
        file_url = flashconfiguration.find('file').text
        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
        # Replace some of the parameters in the query to get the best quality
        # and http links (no m3u8 manifests)
        file_url = re.sub(r'(?<=\?)(.+)$',
                          lambda m: self._clean_query(m.group()),
                          file_url)
        info = self._download_xml(file_url, video_id,
                                  'Downloading video info')
        item = info.find('channel/item')

        # Built once instead of on every _bp() call.
        namespaces = {
            'media': 'http://search.yahoo.com/mrss/',
            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats',
        }

        def _bp(p):
            return xpath_with_ns(p, namespaces)

        formats = []
        duration = None
        for content in item.findall(_bp('media:group/media:content')):
            attr = content.attrib
            f_url = attr['url']
            width = int(attr['width'])
            bitrate = int(attr['bitrate'])
            format_id = '%d-%dk' % (width, bitrate)
            formats.append({
                'format_id': format_id,
                'url': f_url,
                'width': width,
                'tbr': bitrate,
            })
            # Record the duration here rather than reading the loop variable
            # after the loop: that name is unbound when there are no content
            # elements, and the last element may lack the attribute.  The
            # last element providing a duration still wins, as before.
            if 'duration' in attr:
                duration = int(attr['duration'])

        self._sort_formats(formats)

        # thumbnail and description are treated as optional: a missing
        # element yields None instead of an AttributeError on find()'s result.
        thumbnail_el = item.find(_bp('media:thumbnail'))
        thumbnail = None
        if thumbnail_el is not None:
            thumbnail = thumbnail_el.attrib.get('url')

        description_el = item.find('description')
        description = None
        if description_el is not None:
            description = description_el.text

        return {
            'id': video_id,
            'title': item.find('title').text,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
        }
Commit	Line	Data
9e1e67fc PH	1	from __future__ import unicode_literals
9e1e67fc PH	2
d7e66d39	3	import re
d7e66d39 JMF	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	compat_urlparse,
4b7b839f	8	compat_urllib_parse,
d7e66d39	9	xpath_with_ns,
d7e66d39 JMF	10	)
	11
	12
	13	class InternetVideoArchiveIE(InfoExtractor):
	14	_VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
	15
	16	_TEST = {
9e1e67fc PH	17	'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
	18	'info_dict': {
	19	'id': '452693',
	20	'ext': 'mp4',
	21	'title': 'SKYFALL',
	22	'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
72d53356	23	'duration': 149,
d7e66d39 JMF	24	},
	25	}
	26
	27	@staticmethod
	28	def _build_url(query):
	29	return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
	30
4b7b839f JMF	31	@staticmethod
	32	def _clean_query(query):
	33	NEEDED_ARGS = ['publishedid', 'customerid']
	34	query_dic = compat_urlparse.parse_qs(query)
5f6a1245	35	cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
4b7b839f JMF	36	# Other player ids return m3u8 urls
	37	cleaned_dic['playerid'] = '247'
	38	cleaned_dic['videokbrate'] = '100000'
	39	return compat_urllib_parse.urlencode(cleaned_dic)
	40
d7e66d39 JMF	41	def _real_extract(self, url):
	42	query = compat_urlparse.urlparse(url).query
	43	query_dic = compat_urlparse.parse_qs(query)
	44	video_id = query_dic['publishedid'][0]
	45	url = self._build_url(query)
	46
e26f8712	47	flashconfiguration = self._download_xml(url, video_id,
9e1a5b84	48	'Downloading flash configuration')
d7e66d39 JMF	49	file_url = flashconfiguration.find('file').text
d7e66d39 JMF	50	file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
4b7b839f JMF	51	# Replace some of the parameters in the query to get the best quality
	52	# and http links (no m3u8 manifests)
	53	file_url = re.sub(r'(?<=\?)(.+)$',
9e1a5b84 JW	54	lambda m: self._clean_query(m.group()),
9e1a5b84 JW	55	file_url)
e26f8712	56	info = self._download_xml(file_url, video_id,
9e1a5b84	57	'Downloading video info')
d7e66d39 JMF	58	item = info.find('channel/item')
	59
	60	def _bp(p):
9e1a5b84 JW	61	return xpath_with_ns(
	62	p,
	63	{
	64	'media': 'http://search.yahoo.com/mrss/',
	65	'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats',
	66	}
	67	)
d7e66d39 JMF	68	formats = []
	69	for content in item.findall(_bp('media:group/media:content')):
	70	attr = content.attrib
	71	f_url = attr['url']
12c97873 PH	72	width = int(attr['width'])
	73	bitrate = int(attr['bitrate'])
	74	format_id = '%d-%dk' % (width, bitrate)
d7e66d39	75	formats.append({
12c97873	76	'format_id': format_id,
d7e66d39	77	'url': f_url,
12c97873 PH	78	'width': width,
12c97873 PH	79	'tbr': bitrate,
d7e66d39	80	})
12c97873 PH	81
12c97873 PH	82	self._sort_formats(formats)
d7e66d39	83
cbbd9a9c	84	return {
d7e66d39 JMF	85	'id': video_id,
	86	'title': item.find('title').text,
	87	'formats': formats,
	88	'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
	89	'description': item.find('description').text,
	90	'duration': int(attr['duration']),
	91	}