]>
Commit | Line | Data |
---|---|---|
adc267ee | 1 | from __future__ import unicode_literals |
df537474 | 2 | |
3 | import re | |
4 | ||
5 | from .common import InfoExtractor | |
1db82381 | 6 | from ..utils import ( |
bea7af69 | 7 | clean_html, |
1db82381 | 8 | ExtractorError, |
b0f7f21c RA |
9 | int_or_none, |
10 | parse_iso8601, | |
a5d783f5 | 11 | qualities, |
bea7af69 | 12 | unescapeHTML, |
1db82381 | 13 | ) |
df537474 | 14 | |
5f6a1245 | 15 | |
df537474 | 16 | class Channel9IE(InfoExtractor): |
adc267ee | 17 | IE_DESC = 'Channel 9' |
18 | IE_NAME = 'channel9' | |
b0f7f21c | 19 | _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' |
762d44c9 S |
20 | |
21 | _TESTS = [{ | |
22 | 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', | |
b0f7f21c | 23 | 'md5': '32083d4eaf1946db6d454313f44510ca', |
762d44c9 | 24 | 'info_dict': { |
b0f7f21c RA |
25 | 'id': '6c413323-383a-49dc-88f9-a22800cab024', |
26 | 'ext': 'wmv', | |
762d44c9 | 27 | 'title': 'Developer Kick-Off Session: Stuff We Love', |
b0f7f21c | 28 | 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', |
762d44c9 | 29 | 'duration': 4576, |
b0f7f21c RA |
30 | 'thumbnail': r're:https?://.*\.jpg', |
31 | 'timestamp': 1377717420, | |
32 | 'upload_date': '20130828', | |
762d44c9 | 33 | 'session_code': 'KOS002', |
762d44c9 | 34 | 'session_room': 'Arena 1A', |
b0f7f21c | 35 | 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'], |
df537474 | 36 | }, |
762d44c9 S |
37 | }, { |
38 | 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', | |
b0f7f21c | 39 | 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', |
762d44c9 | 40 | 'info_dict': { |
b0f7f21c RA |
41 | 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', |
42 | 'ext': 'wmv', | |
762d44c9 | 43 | 'title': 'Self-service BI with Power BI - nuclear testing', |
b0f7f21c | 44 | 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', |
762d44c9 | 45 | 'duration': 1540, |
b0f7f21c RA |
46 | 'thumbnail': r're:https?://.*\.jpg', |
47 | 'timestamp': 1386381991, | |
48 | 'upload_date': '20131207', | |
762d44c9 | 49 | 'authors': ['Mike Wilmot'], |
a13d06de | 50 | }, |
762d44c9 S |
51 | }, { |
52 | # low quality mp4 is best | |
53 | 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', | |
54 | 'info_dict': { | |
b0f7f21c | 55 | 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', |
762d44c9 S |
56 | 'ext': 'mp4', |
57 | 'title': 'Ranges for the Standard Library', | |
b0f7f21c | 58 | 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', |
762d44c9 | 59 | 'duration': 5646, |
b0f7f21c RA |
60 | 'thumbnail': r're:https?://.*\.jpg', |
61 | 'upload_date': '20150930', | |
62 | 'timestamp': 1443640735, | |
762d44c9 S |
63 | }, |
64 | 'params': { | |
65 | 'skip_download': True, | |
66 | }, | |
67 | }, { | |
68 | 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', | |
69 | 'info_dict': { | |
70 | 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', | |
71 | 'title': 'Channel 9', | |
72 | }, | |
b0f7f21c | 73 | 'playlist_mincount': 100, |
762d44c9 S |
74 | }, { |
75 | 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', | |
76 | 'only_matching': True, | |
77 | }, { | |
78 | 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', | |
79 | 'only_matching': True, | |
80 | }] | |
df537474 | 81 | |
82 | _RSS_URL = 'http://channel9.msdn.com/%s/RSS' | |
df537474 | 83 | |
26bae2d9 S |
84 | @staticmethod |
85 | def _extract_urls(webpage): | |
86 | return re.findall( | |
87 | r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', | |
88 | webpage) | |
89 | ||
762d44c9 S |
90 | def _extract_list(self, video_id, rss_url=None): |
91 | if not rss_url: | |
92 | rss_url = self._RSS_URL % video_id | |
93 | rss = self._download_xml(rss_url, video_id, 'Downloading RSS') | |
4d2ebb6b | 94 | entries = [self.url_result(session_url.text, 'Channel9') |
95 | for session_url in rss.findall('./channel/item/link')] | |
96 | title_text = rss.find('./channel/title').text | |
762d44c9 | 97 | return self.playlist_result(entries, video_id, title_text) |
df537474 | 98 | |
99 | def _real_extract(self, url): | |
b0f7f21c | 100 | content_path, rss = re.match(self._VALID_URL, url).groups() |
762d44c9 S |
101 | |
102 | if rss: | |
103 | return self._extract_list(content_path, url) | |
df537474 | 104 | |
762d44c9 S |
105 | webpage = self._download_webpage( |
106 | url, content_path, 'Downloading web page') | |
df537474 | 107 | |
b0f7f21c RA |
108 | episode_data = self._search_regex( |
109 | r"data-episode='([^']+)'", webpage, 'episode data', default=None) | |
110 | if episode_data: | |
111 | episode_data = self._parse_json(unescapeHTML( | |
112 | episode_data), content_path) | |
113 | content_id = episode_data['contentId'] | |
114 | is_session = '/Sessions(' in episode_data['api'] | |
115 | content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] | |
116 | if is_session: | |
117 | content_url += '?$expand=Speakers' | |
118 | else: | |
119 | content_url += '?$expand=Authors' | |
120 | content_data = self._download_json(content_url, content_id) | |
121 | title = content_data['Title'] | |
122 | ||
a5d783f5 S |
123 | QUALITIES = ( |
124 | 'mp3', | |
125 | 'wmv', 'mp4', | |
126 | 'wmv-low', 'mp4-low', | |
127 | 'wmv-mid', 'mp4-mid', | |
128 | 'wmv-high', 'mp4-high', | |
129 | ) | |
130 | ||
131 | quality_key = qualities(QUALITIES) | |
132 | ||
133 | def quality(quality_id, format_url): | |
134 | return (len(QUALITIES) if '_Source.' in format_url | |
135 | else quality_key(quality_id)) | |
136 | ||
b0f7f21c | 137 | formats = [] |
a5d783f5 S |
138 | urls = set() |
139 | ||
140 | SITE_QUALITIES = { | |
141 | 'MP3': 'mp3', | |
142 | 'MP4': 'mp4', | |
143 | 'Low Quality WMV': 'wmv-low', | |
144 | 'Low Quality MP4': 'mp4-low', | |
145 | 'Mid Quality WMV': 'wmv-mid', | |
146 | 'Mid Quality MP4': 'mp4-mid', | |
147 | 'High Quality WMV': 'wmv-high', | |
148 | 'High Quality MP4': 'mp4-high', | |
149 | } | |
150 | ||
151 | formats_select = self._search_regex( | |
152 | r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, | |
153 | 'formats select', default=None) | |
154 | if formats_select: | |
155 | for mobj in re.finditer( | |
156 | r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', | |
157 | formats_select): | |
158 | format_url = mobj.group('url') | |
159 | if format_url in urls: | |
160 | continue | |
161 | urls.add(format_url) | |
162 | format_id = mobj.group('format') | |
163 | quality_id = SITE_QUALITIES.get(format_id, format_id) | |
164 | formats.append({ | |
165 | 'url': format_url, | |
166 | 'format_id': quality_id, | |
167 | 'quality': quality(quality_id, format_url), | |
168 | 'vcodec': 'none' if quality_id == 'mp3' else None, | |
169 | }) | |
170 | ||
171 | API_QUALITIES = { | |
172 | 'VideoMP4Low': 'mp4-low', | |
173 | 'VideoWMV': 'wmv-mid', | |
174 | 'VideoMP4Medium': 'mp4-mid', | |
175 | 'VideoMP4High': 'mp4-high', | |
176 | 'VideoWMVHQ': 'wmv-hq', | |
177 | } | |
178 | ||
179 | for format_id, q in API_QUALITIES.items(): | |
180 | q_url = content_data.get(format_id) | |
181 | if not q_url or q_url in urls: | |
b0f7f21c | 182 | continue |
a5d783f5 | 183 | urls.add(q_url) |
b0f7f21c | 184 | formats.append({ |
b0f7f21c | 185 | 'url': q_url, |
a5d783f5 S |
186 | 'format_id': q, |
187 | 'quality': quality(q, q_url), | |
b0f7f21c | 188 | }) |
a5d783f5 S |
189 | |
190 | self._sort_formats(formats) | |
191 | ||
b0f7f21c RA |
192 | slides = content_data.get('Slides') |
193 | zip_file = content_data.get('ZipFile') | |
194 | ||
195 | if not formats and not slides and not zip_file: | |
196 | raise ExtractorError( | |
197 | 'None of recording, slides or zip are available for %s' % content_path) | |
198 | ||
199 | subtitles = {} | |
200 | for caption in content_data.get('Captions', []): | |
201 | caption_url = caption.get('Url') | |
202 | if not caption_url: | |
203 | continue | |
204 | subtitles.setdefault(caption.get('Language', 'en'), []).append({ | |
205 | 'url': caption_url, | |
206 | 'ext': 'vtt', | |
207 | }) | |
208 | ||
209 | common = { | |
210 | 'id': content_id, | |
211 | 'title': title, | |
212 | 'description': clean_html(content_data.get('Description') or content_data.get('Body')), | |
213 | 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), | |
214 | 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), | |
215 | 'timestamp': parse_iso8601(content_data.get('PublishedDate')), | |
216 | 'avg_rating': int_or_none(content_data.get('Rating')), | |
217 | 'rating_count': int_or_none(content_data.get('RatingCount')), | |
218 | 'view_count': int_or_none(content_data.get('Views')), | |
219 | 'comment_count': int_or_none(content_data.get('CommentCount')), | |
220 | 'subtitles': subtitles, | |
221 | } | |
222 | if is_session: | |
223 | speakers = [] | |
224 | for s in content_data.get('Speakers', []): | |
225 | speaker_name = s.get('FullName') | |
226 | if not speaker_name: | |
227 | continue | |
228 | speakers.append(speaker_name) | |
229 | ||
230 | common.update({ | |
231 | 'session_code': content_data.get('Code'), | |
232 | 'session_room': content_data.get('Room'), | |
233 | 'session_speakers': speakers, | |
234 | }) | |
a316a83d | 235 | else: |
b0f7f21c RA |
236 | authors = [] |
237 | for a in content_data.get('Authors', []): | |
238 | author_name = a.get('DisplayName') | |
239 | if not author_name: | |
240 | continue | |
241 | authors.append(author_name) | |
242 | common['authors'] = authors | |
243 | ||
244 | contents = [] | |
245 | ||
246 | if slides: | |
247 | d = common.copy() | |
248 | d.update({'title': title + '-Slides', 'url': slides}) | |
249 | contents.append(d) | |
250 | ||
251 | if zip_file: | |
252 | d = common.copy() | |
253 | d.update({'title': title + '-Zip', 'url': zip_file}) | |
254 | contents.append(d) | |
255 | ||
256 | if formats: | |
257 | d = common.copy() | |
258 | d.update({'title': title, 'formats': formats}) | |
259 | contents.append(d) | |
260 | return self.playlist_result(contents) | |
261 | else: | |
df537474 | 262 | return self._extract_list(content_path) |