[yt-dlp.git] / youtube_dl / extractor / ard.py

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
)

class ARDIE(InfoExtractor):
    """Extractor for video pages on ardmediathek.de / mediathek.daserste.de.

    The page is scraped with regular expressions: the title comes from the
    ``<h1>`` headline and the available streams from the
    ``mediaCollection.addMediaStream(...)`` calls embedded in the page.
    """

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Non-greedy, so a second ``</h1>`` on the same line is not swallowed
    # into the title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*?)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
    _TEST = {
        u'url': u'http://www.ardmediathek.de/das-erste/tagesschau-in-100-sek?documentId=14077640',
        u'file': u'14077640.mp4',
        u'md5': u'6ca8824255460c787376353f9e20bbd8',
        u'info_dict': {
            u"title": u"11.04.2013 09:23 Uhr - Tagesschau in 100 Sekunden"
        },
        u'skip': u'Requires rtmpdump'
    }

    def _real_extract(self, url):
        """Extract the video described by the Mediathek page at *url*.

        Downloads the page, reads the title from its headline and selects
        the highest-quality stream whose media type is 0.

        Returns a one-element list holding the info dict with the keys
        'id', 'title', 'ext' and 'url', plus 'play_path' for RTMP streams.

        Raises ExtractorError when no video id can be derived from the
        URL, when the title or a usable stream cannot be found, or when
        the selected stream's URL does not have the expected form.
        """
        # Prefer the numeric documentId from the URL; otherwise fall back
        # to the last path component captured by _VALID_URL.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            m = re.match(self._VALID_URL, url)
            if m is None:
                raise ExtractorError(u'Invalid URL: %s' % url)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [mo.groupdict()
                   for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Pages carrying an "fsk" marker are reported as
            # time-restricted; anything else is an extraction failure.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'Unable to find any media streams')

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        if not candidates:
            # max() would raise ValueError on an empty sequence.
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        video_url = stream['video_url']
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # Explicit raise instead of assert: asserts vanish under -O.
            if not video_url.startswith('mp4:'):
                raise ExtractorError(
                    u'Unexpected RTMP play path: %s' % video_url)
            info["url"] = stream["rtmp_url"]
            info["play_path"] = video_url
        else:
            if not video_url.endswith('.mp4'):
                raise ExtractorError(
                    u'Unexpected video URL: %s' % video_url)
            info["url"] = video_url
        return [info]
Commit	Line	Data
d5822b96 PH	1	import re
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	ExtractorError,
	6	)
	7
	8	class ARDIE(InfoExtractor):
	9	_VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
	10	_TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
	11	_MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
6f5ac90c PH	12	_TEST = {
	13	u'url': u'http://www.ardmediathek.de/das-erste/tagesschau-in-100-sek?documentId=14077640',
	14	u'file': u'14077640.mp4',
	15	u'md5': u'6ca8824255460c787376353f9e20bbd8',
	16	u'info_dict': {
	17	u"title": u"11.04.2013 09:23 Uhr - Tagesschau in 100 Sekunden"
	18	},
	19	u'skip': u'Requires rtmpdump'
	20	}
d5822b96 PH	21
	22	def _real_extract(self, url):
	23	# determine video id from url
	24	m = re.match(self._VALID_URL, url)
	25
	26	numid = re.search(r'documentId=([0-9]+)', url)
	27	if numid:
	28	video_id = numid.group(1)
	29	else:
	30	video_id = m.group('video_id')
	31
	32	# determine title and media streams from webpage
	33	html = self._download_webpage(url, video_id)
	34	title = re.search(self._TITLE, html).group('title')
20c3893f	35	streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
d5822b96 PH	36	if not streams:
	37	assert '"fsk"' in html
	38	raise ExtractorError(u'This video is only available after 8:00 pm')
	39
	40	# choose default media type and highest quality for now
	41	stream = max([s for s in streams if int(s["media_type"]) == 0],
	42	key=lambda s: int(s["quality"]))
	43
	44	# there's two possibilities: RTMP stream or HTTP download
	45	info = {'id': video_id, 'title': title, 'ext': 'mp4'}
	46	if stream['rtmp_url']:
	47	self.to_screen(u'RTMP download detected')
	48	assert stream['video_url'].startswith('mp4:')
	49	info["url"] = stream["rtmp_url"]
	50	info["play_path"] = stream['video_url']
	51	else:
	52	assert stream["video_url"].endswith('.mp4')
	53	info["url"] = stream["video_url"]
	54	return [info]