[yt-dlp.git] / youtube_dl / extractor / internetvideoarchive.py

import re

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    compat_urllib_parse,
    xpath_with_ns,
)


class InternetVideoArchiveIE(InfoExtractor):
    """Extractor for video.internetvideoarchive.net flash player URLs.

    The player URL's query string is forwarded to the flash configuration
    endpoint, whose ``file`` entry points at a playlist.  That playlist is
    fetched as MRSS and its ``media:content`` entries become the formats.
    """

    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'

    _TEST = {
        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
        u'file': u'452693.mp4',
        u'info_dict': {
            u'title': u'SKYFALL',
            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
            u'duration': 153,
        },
    }

    @staticmethod
    def _build_url(query):
        """Return the flash configuration URL carrying the given query string."""
        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query

    @staticmethod
    def _clean_query(query):
        """Return a reduced, url-encoded query string for the playlist request.

        Only ``publishedid`` and ``customerid`` are kept from *query* (first
        value of each); ``playerid`` and ``videokbrate`` are then forced.
        """
        NEEDED_ARGS = ['publishedid', 'customerid']
        query_dic = compat_urlparse.parse_qs(query)
        cleaned_dic = dict(
            (k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
        # Other player ids return m3u8 urls
        cleaned_dic['playerid'] = '247'
        cleaned_dic['videokbrate'] = '100000'
        return compat_urllib_parse.urlencode(cleaned_dic)

    def _real_extract(self, url):
        """Download the player configuration and playlist, return the info dict.

        The video id is the first ``publishedid`` value of the URL's query.
        ``thumbnail``, ``description`` and ``duration`` are ``None`` when the
        playlist does not provide them.
        """
        query = compat_urlparse.urlparse(url).query
        query_dic = compat_urlparse.parse_qs(query)
        video_id = query_dic['publishedid'][0]
        config_url = self._build_url(query)

        flashconfiguration = self._download_xml(config_url, video_id,
            u'Downloading flash configuration')
        file_url = flashconfiguration.find('file').text
        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
        # Replace some of the parameters in the query to get the best quality
        # and http links (no m3u8 manifests)
        file_url = re.sub(r'(?<=\?)(.+)$',
            lambda m: self._clean_query(m.group()),
            file_url)
        info = self._download_xml(file_url, video_id,
            u'Downloading video info')
        item = info.find('channel/item')

        def _bp(p):
            return xpath_with_ns(p,
                {'media': 'http://search.yahoo.com/mrss/',
                'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})

        formats = []
        # Tracked explicitly instead of reading the loop variable after the
        # loop, which raised NameError when there were no media:content
        # elements and KeyError when the last one had no duration.
        duration = None
        for content in item.findall(_bp('media:group/media:content')):
            attr = content.attrib
            f_url = attr['url']
            width = int(attr['width'])
            bitrate = int(attr['bitrate'])
            format_id = '%d-%dk' % (width, bitrate)
            formats.append({
                'format_id': format_id,
                'url': f_url,
                'width': width,
                'tbr': bitrate,
            })
            if 'duration' in attr:
                duration = int(attr['duration'])

        self._sort_formats(formats)

        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would discard valid elements.
        thumbnail_el = item.find(_bp('media:thumbnail'))
        thumbnail = None
        if thumbnail_el is not None:
            thumbnail = thumbnail_el.attrib.get('url')

        description_el = item.find('description')
        description = None
        if description_el is not None:
            description = description_el.text

        return {
            'id': video_id,
            'title': item.find('title').text,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
        }
Commit	Line	Data
d7e66d39	1	import re
d7e66d39 JMF	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	compat_urlparse,
4b7b839f	6	compat_urllib_parse,
d7e66d39	7	xpath_with_ns,
d7e66d39 JMF	8	)
	9
	10
	11	class InternetVideoArchiveIE(InfoExtractor):
	12	_VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
	13
	14	_TEST = {
	15	u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
	16	u'file': u'452693.mp4',
	17	u'info_dict': {
	18	u'title': u'SKYFALL',
	19	u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
cbbd9a9c	20	u'duration': 153,
d7e66d39 JMF	21	},
	22	}
	23
	24	@staticmethod
	25	def _build_url(query):
	26	return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
	27
4b7b839f JMF	28	@staticmethod
	29	def _clean_query(query):
	30	NEEDED_ARGS = ['publishedid', 'customerid']
	31	query_dic = compat_urlparse.parse_qs(query)
	32	cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
	33	# Other player ids return m3u8 urls
	34	cleaned_dic['playerid'] = '247'
	35	cleaned_dic['videokbrate'] = '100000'
	36	return compat_urllib_parse.urlencode(cleaned_dic)
	37
d7e66d39 JMF	38	def _real_extract(self, url):
	39	query = compat_urlparse.urlparse(url).query
	40	query_dic = compat_urlparse.parse_qs(query)
	41	video_id = query_dic['publishedid'][0]
	42	url = self._build_url(query)
	43
e26f8712	44	flashconfiguration = self._download_xml(url, video_id,
d7e66d39	45	u'Downloading flash configuration')
d7e66d39 JMF	46	file_url = flashconfiguration.find('file').text
d7e66d39 JMF	47	file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
4b7b839f JMF	48	# Replace some of the parameters in the query to get the best quality
	49	# and http links (no m3u8 manifests)
	50	file_url = re.sub(r'(?<=\?)(.+)$',
	51	lambda m: self._clean_query(m.group()),
	52	file_url)
e26f8712	53	info = self._download_xml(file_url, video_id,
d7e66d39	54	u'Downloading video info')
d7e66d39 JMF	55	item = info.find('channel/item')
	56
	57	def _bp(p):
	58	return xpath_with_ns(p,
	59	{'media': 'http://search.yahoo.com/mrss/',
	60	'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
	61	formats = []
	62	for content in item.findall(_bp('media:group/media:content')):
	63	attr = content.attrib
	64	f_url = attr['url']
12c97873 PH	65	width = int(attr['width'])
	66	bitrate = int(attr['bitrate'])
	67	format_id = '%d-%dk' % (width, bitrate)
d7e66d39	68	formats.append({
12c97873	69	'format_id': format_id,
d7e66d39	70	'url': f_url,
12c97873 PH	71	'width': width,
12c97873 PH	72	'tbr': bitrate,
d7e66d39	73	})
12c97873 PH	74
12c97873 PH	75	self._sort_formats(formats)
d7e66d39	76
cbbd9a9c	77	return {
d7e66d39 JMF	78	'id': video_id,
	79	'title': item.find('title').text,
	80	'formats': formats,
	81	'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
	82	'description': item.find('description').text,
	83	'duration': int(attr['duration']),
	84	}