[yt-dlp.git] / youtube_dl / extractor / ccc.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    qualities,
    unified_strdate,
)


class CCCIE(InfoExtractor):
    """Information extractor for talk pages on media.ccc.de.

    The talk page is downloaded once and scraped with regular expressions
    for the title, description, upload date, view count, thumbnail and the
    list of downloadable files (direct HTTP links and, where present, the
    matching ``.torrent`` links).
    """

    IE_NAME = 'media.ccc.de'
    # NOTE: after the ``id`` group the pattern requires one arbitrary
    # character (the ``.``) followed by ``_``.  For a URL fragment such as
    # ``_201312281830_-_`` the ``.`` therefore consumes the last digit, which
    # is why the test below expects the id '20131228183'.
    _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/[^?#]+/[^?#/]*?_(?P<id>[0-9]{8,})._[^?#/]*\.html'

    _TEST = {
        'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
        'info_dict': {
            'id': '20131228183',
            'ext': 'mp4',
            'title': 'Introduction to Processor Design',
            'description': 'md5:5ddbf8c734800267f2cee4eab187bc1b',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'view_count': int,
            'upload_date': '20131229',
        }
    }

    def _real_extract(self, url):
        """Scrape the talk page at *url* and return its info dict.

        The returned dict carries the keys ``id``, ``title``,
        ``description``, ``thumbnail``, ``view_count``, ``upload_date`` and
        ``formats``.  Only the title lookup is fatal; the other metadata
        lookups are performed with ``fatal=False``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Two orderings of the known format ids; which one is used depends on
        # the 'prefer_free_formats' option.  The lists appear to run from
        # least to most preferred -- confirm against utils.qualities().
        if self._downloader.params.get('prefer_free_formats'):
            preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd'])
        else:
            preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd'])

        title = self._html_search_regex(
            r'(?s)<h1>(.*?)</h1>', webpage, 'title')
        description = self._html_search_regex(
            r"(?s)<p class='description'>(.*?)</p>",
            webpage, 'description', fatal=False)
        upload_date = unified_strdate(self._html_search_regex(
            r"(?s)<span class='[^']*fa-calendar-o'></span>(.*?)</li>",
            webpage, 'upload date', fatal=False))
        view_count = int_or_none(self._html_search_regex(
            r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
            webpage, 'view count', fatal=False))

        # Each match is one file-type label followed by its direct download
        # link and, optionally, a later link ending in '.torrent'.
        matches = re.finditer(r'''(?xs)
            <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
            (?:
                .*?
                <a\s+href='(?P<torrent_url>[^']+\.torrent)'
            )?''', webpage)
        formats = []
        for m in matches:
            format_name = m.group('format')
            http_url = m.group('http_url')
            torrent_url = m.group('torrent_url')

            # The format id is the last directory component of the download
            # URL; it is None when the URL does not have that shape.
            format_id = self._search_regex(
                r'.*/([a-z0-9_-]+)/[^/]*$',
                http_url, 'format id', default=None)

            # Guard against a missing format id: a substring test on None
            # would raise TypeError and abort the whole extraction.
            if format_id is None:
                vcodec = None
            elif 'h264' in format_id:
                vcodec = 'h264'
            elif format_id in ('mp3', 'opus'):
                # Audio-only downloads carry no video stream.
                vcodec = 'none'
            else:
                vcodec = None

            rank = preference(format_id)
            formats.append({
                'format_id': format_id,
                'format': format_name,
                'url': http_url,
                'vcodec': vcodec,
                'preference': rank,
            })

            if torrent_url:
                formats.append({
                    # Fall back to the human-readable label when no id could
                    # be derived from the download URL.
                    'format_id': 'torrent-%s' % (format_name if format_id is None else format_id),
                    'format': '%s (torrent)' % format_name,
                    'proto': 'torrent',
                    'format_note': '(unsupported; will just download the .torrent file)',
                    'vcodec': vcodec,
                    # Shift torrent entries 100 below their direct-download
                    # counterpart in the preference ordering.
                    'preference': -100 + rank,
                    'url': torrent_url,
                })
        self._sort_formats(formats)

        thumbnail = self._html_search_regex(
            r"<video.*?poster='([^']+)'", webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'upload_date': upload_date,
            'formats': formats,
        }
Commit	Line	Data
8f84f571 PH	1	from __future__ import unicode_literals
	2
	3	import re
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	int_or_none,
	8	qualities,
	9	unified_strdate,
	10	)
	11
	12
	13	class CCCIE(InfoExtractor):
	14	IE_NAME = 'media.ccc.de'
	15	_VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/[^?#]+/[^?#/]*?_(?P<id>[0-9]{8,})._[^?#/]*\.html'
	16
	17	_TEST = {
	18	'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
15da7ce7	19	'md5': '3a1eda8f3a29515d27f5adb967d7e740',
8f84f571 PH	20	'info_dict': {
	21	'id': '20131228183',
	22	'ext': 'mp4',
	23	'title': 'Introduction to Processor Design',
	24	'description': 'md5:5ddbf8c734800267f2cee4eab187bc1b',
	25	'thumbnail': 're:^https?://.*\.jpg$',
	26	'view_count': int,
	27	'upload_date': '20131229',
	28	}
	29	}
	30
	31	def _real_extract(self, url):
	32	video_id = self._match_id(url)
	33	webpage = self._download_webpage(url, video_id)
	34
	35	if self._downloader.params.get('prefer_free_formats'):
	36	preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd'])
	37	else:
	38	preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd'])
	39
	40	title = self._html_search_regex(
	41	r'(?s)<h1>(.*?)</h1>', webpage, 'title')
	42	description = self._html_search_regex(
	43	r"(?s)<p class='description'>(.*?)</p>",
	44	webpage, 'description', fatal=False)
	45	upload_date = unified_strdate(self._html_search_regex(
	46	r"(?s)<span class='[^']*fa-calendar-o'></span>(.*?)</li>",
	47	webpage, 'upload date', fatal=False))
	48	view_count = int_or_none(self._html_search_regex(
	49	r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
	50	webpage, 'view count', fatal=False))
	51
	52	matches = re.finditer(r'''(?xs)
	53	<(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
15da7ce7	54	<a\s+download\s+href='(?P<http_url>[^']+)'>\s*
8f84f571 PH	55	(?:
	56	.*?
	57	<a\s+href='(?P<torrent_url>[^']+\.torrent)'
	58	)?''', webpage)
	59	formats = []
	60	for m in matches:
	61	format = m.group('format')
	62	format_id = self._search_regex(
	63	r'.*/([a-z0-9_-]+)/[^/]*$',
	64	m.group('http_url'), 'format id', default=None)
	65	vcodec = 'h264' if 'h264' in format_id else (
	66	'none' if format_id in ('mp3', 'opus') else None
	67	)
	68	formats.append({
	69	'format_id': format_id,
	70	'format': format,
	71	'url': m.group('http_url'),
	72	'vcodec': vcodec,
	73	'preference': preference(format_id),
	74	})
	75
	76	if m.group('torrent_url'):
	77	formats.append({
	78	'format_id': 'torrent-%s' % (format if format_id is None else format_id),
	79	'format': '%s (torrent)' % format,
	80	'proto': 'torrent',
	81	'format_note': '(unsupported; will just download the .torrent file)',
	82	'vcodec': vcodec,
	83	'preference': -100 + preference(format_id),
	84	'url': m.group('torrent_url'),
	85	})
	86	self._sort_formats(formats)
	87
	88	thumbnail = self._html_search_regex(
	89	r"<video.*?poster='([^']+)'", webpage, 'thumbnail', fatal=False)
	90
	91	return {
	92	'id': video_id,
	93	'title': title,
	94	'description': description,
	95	'thumbnail': thumbnail,
	96	'view_count': view_count,
	97	'upload_date': upload_date,
	98	'formats': formats,
	99	}