]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
Merge pull request #8061 from dstftw/introduce-chapter-and-series-fields
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
1db82381
S
6from ..utils import (
7 ExtractorError,
8 parse_filesize,
9 qualities,
10)
df537474 11
5f6a1245 12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                # NOTE: raw string — '\.' in a plain literal raises an
                # invalid-escape DeprecationWarning on modern Python.
                'thumbnail': r're:http://.*\.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': r're:http://.*\.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
        {
            # low quality mp4 is best
            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'info_dict': {
                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
                'ext': 'mp4',
                'title': 'Ranges for the Standard Library',
                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
                'duration': 5646,
                'thumbnail': r're:http://.*\.jpg',
            },
            'params': {
                'skip_download': True,
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        """Scrape the downloadable-format links (URL, quality label, usage
        note and optional file-size popup) out of the page HTML and return a
        sorted formats list."""
        # The (?x) flag must be at the very start of the pattern: Python 3.11+
        # rejects global inline flags placed anywhere else.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            # An 'Audio only' usage note means the download has no video track.
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the page title, preferring the <meta> title over OpenGraph
        and stripping the trailing ' (Channel 9)' suffix if present."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
        TITLE_SUFFIX = ' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry body description, falling back to the
        description <meta> tag when the entry-content markup is absent."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the "length" JSON
        snippet (HH:MM:SS), or None if not found."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        # URL of the slides download link, or None.
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        # URL of the zip download link, or None.
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        # Average rating as float; 0 when the markup is absent.
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        # Number of ratings as int; 0 when the markup is absent.
        # _fix_count already returns an int, so no extra int() wrapper is needed.
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        # View count as int; 0 when the markup is absent.
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        # Comment count as int; 0 when the markup is absent.
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        # Convert a thousands-separated count like '1,234' to int; None stays None.
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names from the author <li>, or None
        when the markup is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of downloadable entries (slides, zip, recording)
        for a content page, sharing the common metadata between them.

        Returns None (after a warning) when nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract a single 'Entry' page; such a page must yield exactly one
        downloadable item, which is annotated with the author list."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page as a playlist of its downloadable
        items, each annotated with session metadata (code, day, room,
        speakers)."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, content_path):
        """Extract an event/list page via its RSS feed as a playlist of
        url_result entries, one per session link."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Dispatch on the page type embedded in the WT.entryid meta tag;
        # a page without it is assumed to be a list.
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)

        else:  # Assuming list
            return self._extract_list(content_path)