[yt-dlp.git] / youtube_dl / extractor / spiegel.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor


class SpiegelIE(InfoExtractor):
    """Information extractor for video pages under spiegel.de/video/.

    The numeric id at the end of the page URL serves as the video id and is
    also used to build the URL of the XML document describing the encodings.
    """

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
    _TESTS = [
        {
            'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
            'file': '1259285.mp4',
            'md5': '2c2754212136f35fb4b19767d242f66e',
            'info_dict': {
                'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
            },
        },
        {
            'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
            'file': '1309159.mp4',
            'md5': 'f2cdf638d7aa47654e251e1aee360af1',
            'info_dict': {
                'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
            },
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the video page at ``url``.

        Downloads the page to read its title, then downloads the per-video
        XML document and turns each of its ``type*`` children (except
        ``type6``) into a format entry.

        Returns a dict with the keys ``id``, ``title``, ``duration`` and
        ``formats``.
        """
        flash_base = 'http://video2.spiegel.de/flash/'

        def child_text(node, name):
            # Text content of the direct child element called `name`.
            return node.find('./' + name).text

        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, 'title')

        idoc = self._download_xml(
            flash_base + video_id + '.xml', video_id,
            note='Downloading XML', errnote='Failed to download XML')

        formats = []
        for node in idoc:
            tag = node.tag
            # Only the <type*> children describe encodings.  Type 6 is
            # blacklisted: it's extremely LQ and not available on the same
            # server.
            if not tag.startswith('type') or tag == 'type6':
                continue
            formats.append({
                'format_id': tag.rpartition('type')[2],
                'url': flash_base + child_text(node, 'filename'),
                'width': int(child_text(node, 'width')),
                'height': int(child_text(node, 'height')),
                'abr': int(child_text(node, 'audiobitrate')),
                'vbr': int(child_text(node, 'videobitrate')),
                'vcodec': child_text(node, 'codec'),
                'acodec': 'MP4A',
            })

        # The duration is read from the first child of the XML root.
        first_entry = idoc[0]
        duration_node = first_entry.findall('./duration')[0]
        duration = float(duration_node.text)

        self._sort_formats(formats)

        info = {
            'id': video_id,
            'title': title,
            'duration': duration,
            'formats': formats,
        }
        return info
Commit	Line	Data
4baff4a4 JMF	1	from __future__ import unicode_literals
4baff4a4 JMF	2
49f5f315	3	import re
49f5f315 PH	4
	5	from .common import InfoExtractor
	6
	7
	8	class SpiegelIE(InfoExtractor):
	9	_VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
7150858d	10	_TESTS = [{
4baff4a4 JMF	11	'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
	12	'file': '1259285.mp4',
	13	'md5': '2c2754212136f35fb4b19767d242f66e',
	14	'info_dict': {
	15	'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
	16	},
7150858d PH	17	},
7150858d PH	18	{
4baff4a4 JMF	19	'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
	20	'file': '1309159.mp4',
	21	'md5': 'f2cdf638d7aa47654e251e1aee360af1',
	22	'info_dict': {
	23	'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
	24	},
7150858d	25	}]
49f5f315 PH	26
	27	def _real_extract(self, url):
	28	m = re.match(self._VALID_URL, url)
	29	video_id = m.group('videoID')
	30
	31	webpage = self._download_webpage(url, video_id)
	32
7150858d	33	video_title = self._html_search_regex(
4baff4a4	34	r'<div class="module-title">(.*?)</div>', webpage, 'title')
49f5f315	35
4baff4a4	36	xml_url = 'http://video2.spiegel.de/flash/' + video_id + '.xml'
e26f8712	37	idoc = self._download_xml(
7150858d	38	xml_url, video_id,
4baff4a4	39	note='Downloading XML', errnote='Failed to download XML')
49f5f315	40
7150858d PH	41	formats = [
	42	{
	43	'format_id': n.tag.rpartition('type')[2],
4baff4a4	44	'url': 'http://video2.spiegel.de/flash/' + n.find('./filename').text,
7150858d PH	45	'width': int(n.find('./width').text),
	46	'height': int(n.find('./height').text),
	47	'abr': int(n.find('./audiobitrate').text),
	48	'vbr': int(n.find('./videobitrate').text),
	49	'vcodec': n.find('./codec').text,
	50	'acodec': 'MP4A',
	51	}
	52	for n in list(idoc)
	53	# Blacklist type 6, it's extremely LQ and not available on the same server
	54	if n.tag.startswith('type') and n.tag != 'type6'
	55	]
7150858d PH	56	duration = float(idoc[0].findall('./duration')[0].text)
7150858d PH	57
e6812ac9 PH	58	self._sort_formats(formats)
e6812ac9 PH	59
4baff4a4	60	return {
49f5f315	61	'id': video_id,
49f5f315 PH	62	'title': video_title,
49f5f315 PH	63	'duration': duration,
7150858d	64	'formats': formats,
49f5f315	65	}