[yt-dlp.git] / youtube_dl / extractor / orf.py

import re
import xml.etree.ElementTree
import json

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    ExtractorError,
    find_xpath_attr,
)

class ORFIE(InfoExtractor):
    """Information extractor for tvthek.orf.at episode and topic pages.

    A page is treated as a playlist: every item found in the embedded flash
    configuration is paired with the matching entry of the page's JSON
    playlist and turned into one video dictionary.
    """

    # The dots of the host name are escaped so that they only match a
    # literal '.', not an arbitrary character.
    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
        u'file': u'6566957.flv',
        u'info_dict': {
            u'title': u'Wetter',
            u'description': u'Christa Kummer, Marcus Wadsak und Kollegen  präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
        },
        u'params': {
            # It uses rtmp
            u'skip_download': True,
        }
    }

    # XML namespace (in ElementTree "{uri}" notation) of the flash config.
    _NS = '{http://tempuri.org/XMLSchema.xsd}'
    # Values of the "quality" attribute, in the order they are tried
    # (the first one found is used as the best quality).
    _QUALITIES = ('Q6A', 'Q4A', 'Q1A')

    def _find_rtmp_url(self, item):
        """Return the URL text of the first usable VideoUrl child of *item*.

        The qualities in ``_QUALITIES`` are tried in order.  An element that
        is present but has no text is skipped so that a lower quality can
        still be used.  Returns None when no quality yields a URL.
        """
        for quality in self._QUALITIES:
            video_url = find_xpath_attr(
                item, '%sVideoUrl' % self._NS, 'quality', quality)
            if video_url is not None and video_url.text:
                return video_url.text
        return None

    def _real_extract(self, url):
        """Download the page at *url* and return a list of video dicts.

        Each dictionary has the keys '_type', 'id', 'title', 'url', 'ext'
        and 'description'.

        Raises ExtractorError when no video URL can be found for an item.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        flash_xml = self._search_regex(
            r"ORF\.flashXML = '(.+?)'", webpage, u'flash xml')
        # parse_qs is used to URL-decode the embedded XML document.
        flash_xml = compat_urlparse.parse_qs('xml=' + flash_xml)['xml'][0]
        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))

        playlist_json = self._search_regex(
            r'playlist\': \'(\[.*?\])\'', webpage, u'playlist')
        # The JSON is embedded in a quoted string with its double quotes
        # backslash-escaped; undo that before parsing.
        playlist = json.loads(playlist_json.replace(r'\"', '"'))

        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': self._NS}
        # Fallback description for entries without their own paragraph.
        webpage_description = self._og_search_description(webpage)

        videos = []
        # XML items and JSON entries are paired by position; the counter
        # starts at 1 because it is used in the "playlist_entry_<n>" ids.
        entries = zip(flash_config.findall(xpath), playlist)
        for i, (item, info) in enumerate(entries, 1):
            rtmp_url = self._find_rtmp_url(item)
            if rtmp_url is None:
                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
            description = self._html_search_regex(
                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
                u'description', default=webpage_description, flags=re.DOTALL)
            videos.append({
                '_type': 'video',
                'id': info['id'],
                'title': info['title'],
                'url': rtmp_url,
                'ext': 'flv',
                'description': description,
            })

        return videos
Commit	Line	Data
54543467 JMF	1	import re
	2	import xml.etree.ElementTree
	3	import json
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	compat_urlparse,
	8	ExtractorError,
	9	find_xpath_attr,
	10	)
	11
	12	class ORFIE(InfoExtractor):
	13	_VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
	14
	15	_TEST = {
	16	u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
	17	u'file': u'6566957.flv',
	18	u'info_dict': {
	19	u'title': u'Wetter',
	20	u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
	21	},
	22	u'params': {
	23	# It uses rtmp
	24	u'skip_download': True,
	25	}
	26	}
	27
	28	def _real_extract(self, url):
	29	mobj = re.match(self._VALID_URL, url)
	30	playlist_id = mobj.group('id')
	31	webpage = self._download_webpage(url, playlist_id)
	32
	33	flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
	34	flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
	35	flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
	36	playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
	37	playlist = json.loads(playlist_json)
	38
	39	videos = []
	40	ns = '{http://tempuri.org/XMLSchema.xsd}'
	41	xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
	42	webpage_description = self._og_search_description(webpage)
	43	for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
	44	# Get best quality url
	45	rtmp_url = None
	46	for q in ['Q6A', 'Q4A', 'Q1A']:
	47	video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
	48	if video_url is not None:
	49	rtmp_url = video_url.text
	50	break
	51	if rtmp_url is None:
	52	raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
	53	description = self._html_search_regex(
	54	r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
	55	u'description', default=webpage_description, flags=re.DOTALL)
	56	videos.append({
	57	'_type': 'video',
	58	'id': info['id'],
	59	'title': info['title'],
	60	'url': rtmp_url,
	61	'ext': 'flv',
	62	'description': description,
	63	})
	64
	65	return videos