]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
[smotri] Adapt to new API and modernize
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
4d2ebb6b 6from ..utils import ExtractorError
df537474 7
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    the WT.entryid meta tag of the web page HTML rather than the URL itself,
    as it is not always possible to tell from the URL.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # The content path is matched lazily and anchored at the end of the URL so
    # that an optional trailing slash is not captured as part of the path
    # (it would otherwise leak into the video id and the RSS URL).
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    # Template of the RSS feed URL; %s is replaced with the content path
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality; the position in this list is used as the format preference
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
56
57 def _restore_bytes(self, formatted_size):
58 if not formatted_size:
59 return 0
60 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
61 if not m:
62 return 0
63 units = m.group('units')
64 try:
adc267ee 65 exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
df537474 66 except ValueError:
67 return 0
68 size = float(m.group('size'))
69 return int(size * (1024 ** exponent))
70
71 def _formats_from_html(self, html):
72 FORMAT_REGEX = r'''
73 (?x)
74 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
75 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
76 (?:<div\s+class="popup\s+rounded">\s*
77 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
78 </div>)? # File size part may be missing
79 '''
80 # Extract known formats
a1b92edb
PH
81 formats = [{
82 'url': x.group('url'),
83 'format_id': x.group('quality'),
84 'format_note': x.group('note'),
adc267ee 85 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
a1b92edb
PH
86 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
87 'preference': self._known_formats.index(x.group('quality')),
88 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
89 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
90
91 self._sort_formats(formats)
92
df537474 93 return formats
94
df537474 95 def _extract_title(self, html):
adc267ee 96 title = self._html_search_meta('title', html, 'title')
a316a83d 97 if title is None:
df537474 98 title = self._og_search_title(html)
adc267ee 99 TITLE_SUFFIX = ' (Channel 9)'
df537474 100 if title is not None and title.endswith(TITLE_SUFFIX):
101 title = title[:-len(TITLE_SUFFIX)]
102 return title
103
104 def _extract_description(self, html):
105 DESCRIPTION_REGEX = r'''(?sx)
106 <div\s+class="entry-content">\s*
107 <div\s+id="entry-body">\s*
108 (?P<description>.+?)\s*
109 </div>\s*
110 </div>
111 '''
112 m = re.search(DESCRIPTION_REGEX, html)
113 if m is not None:
114 return m.group('description')
adc267ee 115 return self._html_search_meta('description', html, 'description')
df537474 116
117 def _extract_duration(self, html):
a316a83d 118 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
df537474 119 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
120
121 def _extract_slides(self, html):
122 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
123 return m.group('slidesurl') if m is not None else None
124
125 def _extract_zip(self, html):
126 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
127 return m.group('zipurl') if m is not None else None
128
129 def _extract_avg_rating(self, html):
130 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
131 return float(m.group('avgrating')) if m is not None else 0
132
133 def _extract_rating_count(self, html):
134 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
135 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
136
137 def _extract_view_count(self, html):
138 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
139 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
140
141 def _extract_comment_count(self, html):
142 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
143 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
144
145 def _fix_count(self, count):
146 return int(str(count).replace(',', '')) if count is not None else None
147
148 def _extract_authors(self, html):
149 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
150 if m is None:
151 return None
152 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
153
154 def _extract_session_code(self, html):
155 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
156 return m.group('code') if m is not None else None
157
158 def _extract_session_day(self, html):
159 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
160 return m.group('day') if m is not None else None
161
162 def _extract_session_room(self, html):
163 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
164 return m.group('room') if m is not None else None
165
166 def _extract_session_speakers(self, html):
167 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
168
169 def _extract_content(self, html, content_path):
a316a83d 170 # Look for downloadable content
df537474 171 formats = self._formats_from_html(html)
172 slides = self._extract_slides(html)
173 zip_ = self._extract_zip(html)
174
175 # Nothing to download
176 if len(formats) == 0 and slides is None and zip_ is None:
adc267ee 177 self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
df537474 178 return
179
180 # Extract meta
181 title = self._extract_title(html)
182 description = self._extract_description(html)
183 thumbnail = self._og_search_thumbnail(html)
184 duration = self._extract_duration(html)
185 avg_rating = self._extract_avg_rating(html)
186 rating_count = self._extract_rating_count(html)
187 view_count = self._extract_view_count(html)
188 comment_count = self._extract_comment_count(html)
189
190 common = {'_type': 'video',
191 'id': content_path,
192 'description': description,
193 'thumbnail': thumbnail,
194 'duration': duration,
195 'avg_rating': avg_rating,
196 'rating_count': rating_count,
197 'view_count': view_count,
198 'comment_count': comment_count,
199 }
200
201 result = []
202
203 if slides is not None:
204 d = common.copy()
205 d.update({ 'title': title + '-Slides', 'url': slides })
206 result.append(d)
207
208 if zip_ is not None:
209 d = common.copy()
210 d.update({ 'title': title + '-Zip', 'url': zip_ })
211 result.append(d)
212
213 if len(formats) > 0:
214 d = common.copy()
215 d.update({ 'title': title, 'formats': formats })
216 result.append(d)
217
218 return result
219
220 def _extract_entry_item(self, html, content_path):
221 contents = self._extract_content(html, content_path)
222 if contents is None:
223 return contents
224
225 authors = self._extract_authors(html)
226
227 for content in contents:
228 content['authors'] = authors
229
230 return contents
231
232 def _extract_session(self, html, content_path):
233 contents = self._extract_content(html, content_path)
234 if contents is None:
235 return contents
236
237 session_meta = {'session_code': self._extract_session_code(html),
238 'session_day': self._extract_session_day(html),
239 'session_room': self._extract_session_room(html),
240 'session_speakers': self._extract_session_speakers(html),
241 }
242
243 for content in contents:
244 content.update(session_meta)
245
246 return contents
247
df537474 248 def _extract_list(self, content_path):
adc267ee 249 rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
4d2ebb6b 250 entries = [self.url_result(session_url.text, 'Channel9')
251 for session_url in rss.findall('./channel/item/link')]
252 title_text = rss.find('./channel/title').text
253 return self.playlist_result(entries, content_path, title_text)
df537474 254
255 def _real_extract(self, url):
256 mobj = re.match(self._VALID_URL, url)
257 content_path = mobj.group('contentpath')
258
adc267ee 259 webpage = self._download_webpage(url, content_path, 'Downloading web page')
df537474 260
a316a83d 261 page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
262 if page_type_m is not None:
263 page_type = page_type_m.group('pagetype')
264 if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
265 return self._extract_entry_item(webpage, content_path)
266 elif page_type == 'Session': # Event session page, may contain downloadable content
267 return self._extract_session(webpage, content_path)
268 elif page_type == 'Event':
269 return self._extract_list(content_path)
270 else:
271 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
272
273 else: # Assuming list
df537474 274 return self._extract_list(content_path)