jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/cda.py
[cda] Add new extractor for cda.pl
[yt-dlp.git] / youtube_dl / extractor / cda.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8 decode_packed_codes,
9 ExtractorError,
10 parse_duration
11 )
12
13
class CDAIE(InfoExtractor):
    """Information extractor for videos hosted on cda.pl.

    Both regular watch pages (``/video/<id>``) and embed pages
    (``/<width>x<height>/<id>``) on the ``www`` and ``ebd`` subdomains are
    accepted. Extraction always goes through the embed page, which carries a
    packed JavaScript player configuration holding the media URL and the
    duration, plus links to the pages of the other quality versions.
    """

    _VALID_URL = r'https?://(?:(?:www|ebd)\.)?cda\.pl/(?:video|[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _TESTS = [
        {
            'url': 'http://www.cda.pl/video/5749950c',
            'md5': '6f844bf51b15f31fae165365707ae970',
            'info_dict': {
                'id': '5749950c',
                'ext': 'mp4',
                'height': 720,
                'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
                'duration': 39,
            },
        },
        {
            'url': 'http://www.cda.pl/video/57413289',
            'md5': 'a88828770a8310fc00be6c95faf7f4d5',
            'info_dict': {
                'id': '57413289',
                'ext': 'mp4',
                'title': 'Lądowanie na lotnisku na Maderze',
                'duration': 137,
            },
        },
    ]

    def _real_extract(self, url):
        """Extract the title, duration and all available formats of a video.

        The embed page is downloaded first; every other quality version
        linked from it is then downloaded in turn and contributes one format.

        Raises ExtractorError (expected) when the video is restricted to
        premium users, and ExtractorError when the page title is missing.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            raise ExtractorError('This video is only available for premium users.', expected=True)

        # The title is a mandatory field of the result, so a missing <title>
        # must fail here instead of silently producing 'title': None.
        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')

        formats = []
        # A dict is used as the shared, mutable result so the nested helper
        # can update 'duration' without `nonlocal` (this file still supports
        # Python 2, see the __future__ import).
        info_dict = {
            'id': video_id,
            'title': title,
            'formats': formats,
            'duration': None,
        }

        def extract_format(page, version):
            """Collect the format (and, if still unknown, the duration) from *page*.

            *version* is only used to label the URL field in warnings.
            """
            unpacked = decode_packed_codes(page)

            # Record the duration independently of the URL lookup below, so
            # it is not lost when a page carries no usable media URL.
            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(self._search_regex(
                    r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False))

            format_url = self._search_regex(
                r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False)
            if format_url is None:
                return

            format_id = None
            height = None
            # The active quality button describes the version served by this page.
            m = re.search(
                r'<a data-quality="(?P<format_id>[^"]+)" href="[^"]+" class="quality-btn quality-btn-active">(?P<height>[0-9]+)p<\/a>',
                page)
            if m:
                format_id = m.group('format_id')
                height = int(m.group('height'))

            formats.append({
                'format_id': format_id,
                'height': height,
                'url': format_url,
            })

        extract_format(webpage, 'default')

        # Inactive quality buttons link to the pages of the remaining versions.
        quality_links = re.findall(
            r'<a data-quality="[^"]+" href="([^"]+)" class="quality-btn">([0-9]+p)<\/a>',
            webpage)
        for href, resolution in quality_links:
            # Use a separate name so the embed page in `webpage` is not clobbered.
            version_page = self._download_webpage(
                href, video_id,
                'Downloading %s version information' % resolution, fatal=False)
            if not version_page:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning('Unable to download %s version information' % resolution)
                continue

            extract_format(version_page, resolution)

        self._sort_formats(formats)

        return info_dict