]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
Fix "invalid escape sequences" error on Python 3.6
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
1db82381
S
6from ..utils import (
7 ExtractorError,
8 parse_filesize,
9 qualities,
10)
df537474 11
5f6a1245 12
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
        'info_dict': {
            'id': 'Events/TechEd/Australia/2013/KOS002',
            'ext': 'mp4',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
            'duration': 4576,
            'thumbnail': r're:http://.*\.jpg',
            'session_code': 'KOS002',
            'session_day': 'Day 1',
            'session_room': 'Arena 1A',
            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                 'Mads Kristensen'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
        'info_dict': {
            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'ext': 'mp4',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
            'duration': 1540,
            'thumbnail': r're:http://.*\.jpg',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
            'duration': 5646,
            'thumbnail': r're:http://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    def _formats_from_html(self, html):
        """Scrape the download links from the page and return sorted formats."""
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                # File size part may be missing
        '''
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        # No need to materialize the finditer iterator before iterating it
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the page title, preferring the meta tag, minus the site suffix."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
        TITLE_SUFFIX = ' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry body HTML, falling back to the meta description."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds parsed from the "length" JSON field, or None."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None if not present."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None if not present."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 if not present."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 if not present."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        # _fix_count already returns an int for a non-None argument
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 if not present."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 if not present."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Parse a count string like "1,234" into an int; None stays None."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None if the author block is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. "KOS002"), or None."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day label (e.g. "Day 1"), or None."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room name, or None."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the list of session speaker names (possibly empty)."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Extract downloadable content (recording, slides, zip) and shared metadata.

        Returns a list of info dicts, or None if nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract a single 'item'-like page; raises if it yields multiple entries."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        """Extract an event session page as a playlist annotated with session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, video_id, rss_url=None):
        """Extract a playlist from an RSS feed (derived from video_id unless given)."""
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        rss = mobj.group('rss')

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        page_type = self._search_regex(
            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
            webpage, 'page type', default=None, group='pagetype')
        if page_type:
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)