[yt-dlp.git] / youtube_dl / extractor / fktv.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    clean_html,
    determine_ext,
    ExtractorError,
)


class FKTVIE(InfoExtractor):
    """Information extractor for episodes ("Folge N") of fernsehkritik.tv.

    The episode number is taken from the URL, the ``/play`` page of that
    episode is downloaded, and the title, the poster image and the
    ``<source>`` URLs of the embedded ``<video>`` element are scraped
    from its HTML.
    """

    IE_NAME = 'fernsehkritik.tv'
    _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'

    _TEST = {
        'url': 'http://fernsehkritik.tv/folge-1',
        'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79',
        'info_dict': {
            'id': '1',
            'ext': 'mp4',
            'title': 'Folge 1 vom 10. April 2007',
            # Raw string: "\." inside a plain literal is an invalid escape
            # sequence and triggers a warning on newer Python versions.
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }

    def _real_extract(self, url):
        """Scrape the play page of the episode referenced by *url*.

        Returns a dict with the keys ``id`` (the episode number matched
        from the URL), ``title``, ``formats`` (one entry per ``<source>``
        URL, with ``format_id`` set to the result of ``determine_ext`` on
        that URL) and ``thumbnail`` (the ``poster`` attribute of the
        ``<video>`` tag, or ``None`` if the tag has none).

        Raises ExtractorError if the page contains no ``<video>`` element.
        """
        episode = self._match_id(url)

        webpage = self._download_webpage(
            'http://fernsehkritik.tv/folge-%s/play' % episode, episode)
        title = clean_html(self._html_search_regex(
            r'<h3>([^<]+)</h3>', webpage, 'title'))

        # Group 1: value of the optional poster attribute.
        # Group 2: everything between the opening <video ...> tag and the
        # last </video> on the page ((?s) lets "." span newlines, and the
        # match is greedy).
        matches = re.search(
            r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>',
            webpage)
        if matches is None:
            raise ExtractorError('Unable to extract the video')

        poster, sources = matches.groups()
        if poster is None:
            # A missing thumbnail is not fatal; only warn about it.
            self.report_warning('unable to extract thumbnail')

        urls = re.findall(r'<source[^>]+src="([^"]+)"', sources)
        formats = [{
            'url': furl,
            'format_id': determine_ext(furl),
        } for furl in urls]
        return {
            'id': episode,
            'title': title,
            'formats': formats,
            'thumbnail': poster,
        }
Commit	Line	Data
bf7aa630 PH	1	from __future__ import unicode_literals
bf7aa630 PH	2
c5e743f6	3	import re
71c107fc	4
	5	from .common import InfoExtractor
	6	from ..utils import (
c5e743f6	7	clean_html,
7b4137c3	8	determine_ext,
8ddf48d5	9	ExtractorError,
71c107fc	10	)
71c107fc	11
c5e743f6	12
71c107fc	13	class FKTVIE(InfoExtractor):
bf7aa630	14	IE_NAME = 'fernsehkritik.tv'
bd6b25ce	15	_VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'
71c107fc	16
c5e743f6	17	_TEST = {
bf7aa630	18	'url': 'http://fernsehkritik.tv/folge-1',
7b4137c3	19	'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79',
bf7aa630	20	'info_dict': {
7b4137c3	21	'id': '1',
7b4137c3	22	'ext': 'mp4',
bf7aa630	23	'title': 'Folge 1 vom 10. April 2007',
57738039	24	'thumbnail': 're:^https?://.*\.jpg$',
c5e743f6 JMF	25	},
	26	}
	27
	28	def _real_extract(self, url):
7b4137c3	29	episode = self._match_id(url)
7b4137c3	30
711762f0 YCH	31	webpage = self._download_webpage(
	32	'http://fernsehkritik.tv/folge-%s/play' % episode, episode)
	33	title = clean_html(self._html_search_regex(
	34	'<h3>([^<]+)</h3>', webpage, 'title'))
	35	matches = re.search(
8de28761 YCH	36	r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>',
8de28761 YCH	37	webpage)
8ddf48d5 YCH	38	if matches is None:
	39	raise ExtractorError('Unable to extract the video')
	40
	41	poster, sources = matches.groups()
57738039 YCH	42	if poster is None:
	43	self.report_warning('unable to extract thumbnail')
	44
140359fc	45	urls = re.findall(r'<source[^>]+src="([^"]+)"', sources)
711762f0	46	formats = [{
4866b72e	47	'url': furl,
3706fb5d	48	'format_id': determine_ext(furl),
4866b72e	49	} for furl in urls]
8ddf48d5 YCH	50	return {
	51	'id': episode,
	52	'title': title,
	53	'formats': formats,
	54	'thumbnail': poster,
	55	}