[yt-dlp.git] / yt_dlp / extractor / teletask.py

import re

from .common import InfoExtractor
from ..utils import unified_strdate


class TeleTaskIE(InfoExtractor):
    """Extractor for tele-task.de HTML5 archive lecture pages.

    Every ``<video>`` element found on a lecture page is returned as one
    entry of a playlist whose id is the numeric lecture id from the URL.
    """

    # NOTE(review): presumably marks the extractor as currently broken;
    # the flag's exact effect is defined by InfoExtractor - confirm there.
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.tele-task.de/archive/video/html5/26168/',
        'info_dict': {
            'id': '26168',
            'title': 'Duplicate Detection',
        },
        'playlist': [
            {
                'md5': '290ef69fb2792e481169c3958dbfbd57',
                'info_dict': {
                    'id': '26168-speaker',
                    'ext': 'mp4',
                    'title': 'Duplicate Detection',
                    'upload_date': '20141218',
                },
            },
            {
                'md5': 'e1e7218c5f0e4790015a437fcf6c71b4',
                'info_dict': {
                    'id': '26168-slides',
                    'ext': 'mp4',
                    'title': 'Duplicate Detection',
                    'upload_date': '20141218',
                },
            },
        ],
    }

    def _real_extract(self, url):
        """Download the lecture page at *url* and return its videos as a playlist.

        The title lookup uses the default (fatal) search, while the date
        lookup is requested with ``fatal=False``. Each entry id is the
        lecture id joined with the ``class`` attribute of its ``<video>`` tag.
        """
        lecture_id = self._match_id(url)
        page = self._download_webpage(url, lecture_id)

        title_pattern = r'itemprop="name">([^<]+)</a>'
        title = self._html_search_regex(title_pattern, page, 'title')

        date_pattern = r'Date:</td><td>([^<]+)</td>'
        raw_date = self._html_search_regex(
            date_pattern, page, 'date', fatal=False)
        upload_date = unified_strdate(raw_date)

        # Group 1 is the <video> class attribute, group 2 the src of the
        # first <source> element directly following the opening tag.
        stream_pattern = r'<video class="([^"]+)"[^>]*>\s*<source src="([^"]+)"'

        entries = []
        for stream_name, stream_url in re.findall(stream_pattern, page):
            entries.append({
                'id': f'{lecture_id}-{stream_name}',
                'url': stream_url,
                'title': title,
                'upload_date': upload_date,
            })

        return self.playlist_result(entries, lecture_id, title)
Commit	Line	Data
33b53b60	1	import re
ee456252 M	2
ee456252 M	3	from .common import InfoExtractor
885e4384	4	from ..utils import unified_strdate
ee456252 M	5
	6
	7	class TeleTaskIE(InfoExtractor):
df773c3d	8	_WORKING = False
885e4384	9	_VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
ee456252	10	_TEST = {
33b53b60	11	'url': 'http://www.tele-task.de/archive/video/html5/26168/',
ee456252	12	'info_dict': {
654bd52f	13	'id': '26168',
ee456252	14	'title': 'Duplicate Detection',
33b53b60 M	15	},
	16	'playlist': [{
	17	'md5': '290ef69fb2792e481169c3958dbfbd57',
	18	'info_dict': {
885e4384 S	19	'id': '26168-speaker',
885e4384 S	20	'ext': 'mp4',
33b53b60 M	21	'title': 'Duplicate Detection',
33b53b60 M	22	'upload_date': '20141218',
add96eb9	23	},
885e4384	24	}, {
33b53b60 M	25	'md5': 'e1e7218c5f0e4790015a437fcf6c71b4',
33b53b60 M	26	'info_dict': {
885e4384 S	27	'id': '26168-slides',
885e4384 S	28	'ext': 'mp4',
33b53b60 M	29	'title': 'Duplicate Detection',
33b53b60 M	30	'upload_date': '20141218',
add96eb9	31	},
add96eb9	32	}],
ee456252 M	33	}
	34
	35	def _real_extract(self, url):
33b53b60 M	36	lecture_id = self._match_id(url)
	37	webpage = self._download_webpage(url, lecture_id)
	38
ee456252	39	title = self._html_search_regex(
885e4384 S	40	r'itemprop="name">([^<]+)</a>', webpage, 'title')
	41	upload_date = unified_strdate(self._html_search_regex(
	42	r'Date:</td><td>([^<]+)</td>', webpage, 'date', fatal=False))
ee456252	43
33b53b60	44	entries = [{
add96eb9	45	'id': f'{lecture_id}-{format_id}',
885e4384	46	'url': video_url,
33b53b60	47	'title': title,
885e4384 S	48	'upload_date': upload_date,
	49	} for format_id, video_url in re.findall(
	50	r'<video class="([^"]+)"[^>]*>\s*<source src="([^"]+)"', webpage)]
ee456252	51
885e4384	52	return self.playlist_result(entries, lecture_id, title)