]>
Commit | Line | Data |
---|---|---|
1 | from __future__ import unicode_literals | |
2 | ||
3 | import re | |
4 | ||
5 | from .common import InfoExtractor | |
6 | from ..utils import ( | |
7 | clean_html, | |
8 | ExtractorError, | |
9 | int_or_none, | |
10 | parse_iso8601, | |
11 | qualities, | |
12 | unescapeHTML, | |
13 | ) | |
14 | ||
15 | ||
class Channel9IE(InfoExtractor):
    """Extractor for Channel 9 pages (channel9.msdn.com and s.ch9.ms).

    URLs are split by ``_VALID_URL`` into a content path and an optional
    ``/RSS`` suffix:

    * RSS URLs become a playlist with one entry per feed item.
    * Content pages carrying a ``data-episode`` attribute become a playlist
      of up to three entries: slides, zip archive and the recording itself.
    * Any other page falls back to the RSS feed built from its content path.
    """

    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    # Feed URL template; the placeholder receives the content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    @staticmethod
    def _extract_urls(webpage):
        """Return the Channel 9 page URLs embedded in *webpage* via iframes.

        Each returned string is the iframe ``src`` up to, but not
        including, its trailing ``player`` path component.
        """
        return re.findall(
            r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
            webpage)

    def _extract_list(self, video_id, rss_url=None):
        """Build a playlist from a Channel 9 RSS feed.

        video_id -- content path; used as the playlist id and, when
                    *rss_url* is not given, to build the feed URL from
                    ``_RSS_URL``.
        rss_url  -- explicit feed URL to download instead.

        Returns the value of ``self.playlist_result`` for the feed items,
        titled with the feed's channel title when one is present.
        """
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [
            self.url_result(link.text, 'Channel9')
            for link in rss.findall('./channel/item/link')
            # An empty <link/> has no text, so there is no URL to delegate.
            if link.text]
        # find() yields None when the feed lacks <channel><title>; compare
        # against None explicitly because an element without children is
        # falsy.
        title_el = rss.find('./channel/title')
        title_text = title_el.text if title_el is not None else None
        return self.playlist_result(entries, video_id, title_text)

    def _ch9_formats(self, webpage, content_data):
        """Collect the downloadable formats of one content item.

        Formats are gathered from the ``format`` <select> element of
        *webpage* first and then from the known URL fields of
        *content_data*; a URL seen twice is only kept once.  The returned
        list is sorted when non-empty and may be empty.
        """
        QUALITIES = (
            'mp3',
            'wmv', 'mp4',
            'wmv-low', 'mp4-low',
            'wmv-mid', 'mp4-mid',
            'wmv-high', 'mp4-high',
        )

        quality_key = qualities(QUALITIES)

        def quality(quality_id, format_url):
            # A URL containing '_Source.' outranks every listed quality.
            return (len(QUALITIES) if '_Source.' in format_url
                    else quality_key(quality_id))

        formats = []
        urls = set()

        # Labels shown in the page's format selector -> format ids.
        SITE_QUALITIES = {
            'MP3': 'mp3',
            'MP4': 'mp4',
            'Low Quality WMV': 'wmv-low',
            'Low Quality MP4': 'mp4-low',
            'Mid Quality WMV': 'wmv-mid',
            'Mid Quality MP4': 'mp4-mid',
            'High Quality WMV': 'wmv-high',
            'High Quality MP4': 'mp4-high',
        }

        formats_select = self._search_regex(
            r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
            'formats select', default=None)
        if formats_select:
            for mobj in re.finditer(
                    r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
                    formats_select):
                format_url = mobj.group('url')
                if format_url in urls:
                    continue
                urls.add(format_url)
                format_id = mobj.group('format')
                # Unknown labels are kept verbatim as the format id.
                quality_id = SITE_QUALITIES.get(format_id, format_id)
                formats.append({
                    'url': format_url,
                    'format_id': quality_id,
                    'quality': quality(quality_id, format_url),
                    'vcodec': 'none' if quality_id == 'mp3' else None,
                })

        # Fields of the API response that hold media URLs -> format ids.
        # NOTE(review): 'wmv-hq' is not listed in QUALITIES (which has
        # 'wmv-high'); confirm whether that is intended before renaming it,
        # since the value is also the user-visible format id.
        API_QUALITIES = {
            'VideoMP4Low': 'mp4-low',
            'VideoWMV': 'wmv-mid',
            'VideoMP4Medium': 'mp4-mid',
            'VideoMP4High': 'mp4-high',
            'VideoWMVHQ': 'wmv-hq',
        }

        for format_id, q in API_QUALITIES.items():
            q_url = content_data.get(format_id)
            if not q_url or q_url in urls:
                continue
            urls.add(q_url)
            formats.append({
                'url': q_url,
                'format_id': q,
                'quality': quality(q, q_url),
            })

        # Sort only when there is something to sort.  _sort_formats lives
        # outside this file; upstream it is believed to raise on an empty
        # list (verify against .common), which would make the slides/zip
        # only path of _real_extract unreachable.
        if formats:
            self._sort_formats(formats)
        return formats

    @staticmethod
    def _ch9_names(content_data, list_key, name_key):
        """Return the non-empty *name_key* values of content_data[list_key].

        A missing or null list is treated as empty.
        """
        return [
            person.get(name_key)
            for person in content_data.get(list_key) or []
            if person.get(name_key)]

    def _real_extract(self, url):
        """Extract a Channel 9 URL.

        Returns a playlist result: the RSS feed's items for feed URLs and
        for pages without episode data, otherwise the slides, zip and
        recording entries of the content item.

        Raises ExtractorError when a content item offers none of
        recording, slides or zip.
        """
        content_path, rss = re.match(self._VALID_URL, url).groups()

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
        if not episode_data:
            # Not a single content item: use the feed of this content path.
            return self._extract_list(content_path)

        episode_data = self._parse_json(
            unescapeHTML(episode_data), content_path)
        content_id = episode_data['contentId']
        api_path = episode_data['api']
        is_session = '/Sessions(' in api_path
        # Sessions list speakers, everything else lists authors.
        content_url = 'https://channel9.msdn.com/odata%s?$expand=%s' % (
            api_path, 'Speakers' if is_session else 'Authors')
        content_data = self._download_json(content_url, content_id)
        title = content_data['Title']

        formats = self._ch9_formats(webpage, content_data)

        slides = content_data.get('Slides')
        zip_file = content_data.get('ZipFile')

        if not (formats or slides or zip_file):
            raise ExtractorError(
                'None of recording, slides or zip are available for %s' % content_path)

        subtitles = {}
        # 'or []' also covers a key that is present with a null value.
        for caption in content_data.get('Captions') or []:
            caption_url = caption.get('Url')
            if not caption_url:
                continue
            subtitles.setdefault(caption.get('Language', 'en'), []).append({
                'url': caption_url,
                'ext': 'vtt',
            })

        # Metadata shared by every entry of the resulting playlist.
        common = {
            'id': content_id,
            'title': title,
            'description': clean_html(content_data.get('Description') or content_data.get('Body')),
            'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
            'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
            'timestamp': parse_iso8601(content_data.get('PublishedDate')),
            'avg_rating': int_or_none(content_data.get('Rating')),
            'rating_count': int_or_none(content_data.get('RatingCount')),
            'view_count': int_or_none(content_data.get('Views')),
            'comment_count': int_or_none(content_data.get('CommentCount')),
            'subtitles': subtitles,
        }
        if is_session:
            common.update({
                'session_code': content_data.get('Code'),
                'session_room': content_data.get('Room'),
                'session_speakers': self._ch9_names(
                    content_data, 'Speakers', 'FullName'),
            })
        else:
            common['authors'] = self._ch9_names(
                content_data, 'Authors', 'DisplayName')

        contents = []

        # Auxiliary downloads come first, each as its own direct-URL entry.
        for suffix, file_url in (('-Slides', slides), ('-Zip', zip_file)):
            if not file_url:
                continue
            entry = common.copy()
            entry.update({'title': title + suffix, 'url': file_url})
            contents.append(entry)

        if formats:
            entry = common.copy()
            entry.update({'title': title, 'formats': formats})
            contents.append(entry)

        return self.playlist_result(contents)