]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/channel9.py
Merge pull request #7176 from remitamine/megavideoz
[yt-dlp.git] / youtube_dl / extractor / channel9.py
CommitLineData
adc267ee 1from __future__ import unicode_literals
df537474 2
3import re
4
5from .common import InfoExtractor
4d2ebb6b 6from ..utils import ExtractorError
df537474 7
5f6a1245 8
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # Greedy capture: everything after the host is the content path; an
    # optional trailing slash is tolerated.
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'ext': 'mp4',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'duration': 4576,
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'ext': 'mp4',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'duration': 1540,
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        }
    ]

    # Per-content RSS feed URL template, used for list (event) pages.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
57
58 def _restore_bytes(self, formatted_size):
59 if not formatted_size:
60 return 0
61 m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
62 if not m:
63 return 0
64 units = m.group('units')
65 try:
adc267ee 66 exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
df537474 67 except ValueError:
68 return 0
69 size = float(m.group('size'))
70 return int(size * (1024 ** exponent))
71
72 def _formats_from_html(self, html):
73 FORMAT_REGEX = r'''
74 (?x)
75 <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
76 <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
77 (?:<div\s+class="popup\s+rounded">\s*
78 <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
79 </div>)? # File size part may be missing
80 '''
81 # Extract known formats
a1b92edb
PH
82 formats = [{
83 'url': x.group('url'),
84 'format_id': x.group('quality'),
85 'format_note': x.group('note'),
adc267ee 86 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
5f6a1245 87 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
a1b92edb
PH
88 'preference': self._known_formats.index(x.group('quality')),
89 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
90 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
91
92 self._sort_formats(formats)
93
df537474 94 return formats
95
df537474 96 def _extract_title(self, html):
adc267ee 97 title = self._html_search_meta('title', html, 'title')
a316a83d 98 if title is None:
df537474 99 title = self._og_search_title(html)
adc267ee 100 TITLE_SUFFIX = ' (Channel 9)'
df537474 101 if title is not None and title.endswith(TITLE_SUFFIX):
102 title = title[:-len(TITLE_SUFFIX)]
103 return title
104
105 def _extract_description(self, html):
106 DESCRIPTION_REGEX = r'''(?sx)
107 <div\s+class="entry-content">\s*
108 <div\s+id="entry-body">\s*
109 (?P<description>.+?)\s*
110 </div>\s*
111 </div>
112 '''
113 m = re.search(DESCRIPTION_REGEX, html)
114 if m is not None:
115 return m.group('description')
adc267ee 116 return self._html_search_meta('description', html, 'description')
df537474 117
118 def _extract_duration(self, html):
a316a83d 119 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
df537474 120 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
121
122 def _extract_slides(self, html):
123 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
124 return m.group('slidesurl') if m is not None else None
125
126 def _extract_zip(self, html):
127 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
128 return m.group('zipurl') if m is not None else None
129
130 def _extract_avg_rating(self, html):
131 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
132 return float(m.group('avgrating')) if m is not None else 0
133
134 def _extract_rating_count(self, html):
135 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
136 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
137
138 def _extract_view_count(self, html):
139 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
140 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
141
142 def _extract_comment_count(self, html):
143 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
144 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
145
146 def _fix_count(self, count):
147 return int(str(count).replace(',', '')) if count is not None else None
148
149 def _extract_authors(self, html):
150 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
151 if m is None:
152 return None
153 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
154
155 def _extract_session_code(self, html):
156 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
157 return m.group('code') if m is not None else None
158
159 def _extract_session_day(self, html):
160 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
506e261d 161 return m.group('day').strip() if m is not None else None
df537474 162
163 def _extract_session_room(self, html):
164 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
165 return m.group('room') if m is not None else None
166
167 def _extract_session_speakers(self, html):
168 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
169
170 def _extract_content(self, html, content_path):
a316a83d 171 # Look for downloadable content
df537474 172 formats = self._formats_from_html(html)
173 slides = self._extract_slides(html)
174 zip_ = self._extract_zip(html)
175
176 # Nothing to download
177 if len(formats) == 0 and slides is None and zip_ is None:
adc267ee 178 self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
df537474 179 return
180
181 # Extract meta
182 title = self._extract_title(html)
183 description = self._extract_description(html)
184 thumbnail = self._og_search_thumbnail(html)
185 duration = self._extract_duration(html)
186 avg_rating = self._extract_avg_rating(html)
187 rating_count = self._extract_rating_count(html)
188 view_count = self._extract_view_count(html)
189 comment_count = self._extract_comment_count(html)
190
b74e86f4
PH
191 common = {
192 '_type': 'video',
193 'id': content_path,
194 'description': description,
195 'thumbnail': thumbnail,
196 'duration': duration,
197 'avg_rating': avg_rating,
198 'rating_count': rating_count,
199 'view_count': view_count,
200 'comment_count': comment_count,
201 }
df537474 202
203 result = []
204
205 if slides is not None:
206 d = common.copy()
5f6a1245 207 d.update({'title': title + '-Slides', 'url': slides})
df537474 208 result.append(d)
209
210 if zip_ is not None:
211 d = common.copy()
5f6a1245 212 d.update({'title': title + '-Zip', 'url': zip_})
df537474 213 result.append(d)
214
215 if len(formats) > 0:
216 d = common.copy()
5f6a1245 217 d.update({'title': title, 'formats': formats})
df537474 218 result.append(d)
219
220 return result
221
222 def _extract_entry_item(self, html, content_path):
223 contents = self._extract_content(html, content_path)
224 if contents is None:
225 return contents
226
b30c4992
JMF
227 if len(contents) > 1:
228 raise ExtractorError('Got more than one entry')
229 result = contents[0]
230 result['authors'] = self._extract_authors(html)
df537474 231
b30c4992 232 return result
df537474 233
234 def _extract_session(self, html, content_path):
235 contents = self._extract_content(html, content_path)
236 if contents is None:
237 return contents
238
025f30ba
PH
239 session_meta = {
240 'session_code': self._extract_session_code(html),
241 'session_day': self._extract_session_day(html),
242 'session_room': self._extract_session_room(html),
243 'session_speakers': self._extract_session_speakers(html),
244 }
df537474 245
246 for content in contents:
247 content.update(session_meta)
248
025f30ba 249 return self.playlist_result(contents)
df537474 250
df537474 251 def _extract_list(self, content_path):
adc267ee 252 rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
4d2ebb6b 253 entries = [self.url_result(session_url.text, 'Channel9')
254 for session_url in rss.findall('./channel/item/link')]
255 title_text = rss.find('./channel/title').text
256 return self.playlist_result(entries, content_path, title_text)
df537474 257
258 def _real_extract(self, url):
259 mobj = re.match(self._VALID_URL, url)
260 content_path = mobj.group('contentpath')
261
adc267ee 262 webpage = self._download_webpage(url, content_path, 'Downloading web page')
df537474 263
a316a83d 264 page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
265 if page_type_m is not None:
266 page_type = page_type_m.group('pagetype')
267 if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
268 return self._extract_entry_item(webpage, content_path)
269 elif page_type == 'Session': # Event session page, may contain downloadable content
270 return self._extract_session(webpage, content_path)
271 elif page_type == 'Event':
272 return self._extract_list(content_path)
273 else:
274 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
275
5f6a1245 276 else: # Assuming list
df537474 277 return self._extract_list(content_path)