]> jfr.im git - yt-dlp.git/blame - youtube_dl/InfoExtractors.py
release 2013.04.18
[yt-dlp.git] / youtube_dl / InfoExtractors.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
9e8056d5
PH
4from __future__ import absolute_import
5
4fcca4bb 6import base64
d77c3dfd 7import datetime
ccf65f9d 8import itertools
d77c3dfd
FV
9import netrc
10import os
11import re
12import socket
13import time
d77c3dfd 14import email.utils
921a1455 15import xml.etree.ElementTree
302efc19 16import random
17import math
6324fd1d 18import operator
d77c3dfd 19
9e8056d5 20from .utils import *
d77c3dfd
FV
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance.
    _ready = False
    # Downloader object used for screen output and option lookup.
    _downloader = None
    # Set to False in subclasses whose extraction is known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is invoked at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Default extractor name: the class name minus its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note is the progress message printed through the downloader; None
        selects a default message and False suppresses the output.
        errnote is the prefix of the error message; None selects a default.

        Raises ExtractorError when the request fails.
        """
        # Imported here so this method does not depend on the wildcard
        # import from .utils happening to expose the sys module.
        import sys

        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download url_or_request and return the page contents as a string.

        The charset announced in the Content-Type header is used for
        decoding; UTF-8 is used when no charset is announced or when the
        announced charset is not a codec Python knows about. Undecodable
        bytes are replaced rather than raising.

        Arguments are passed on to _request_webpage(), whose ExtractorError
        propagates on network failure.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Stop the charset value at whitespace, ';' or a quote so that
        # trailing parameters and quoting do not end up in the codec name.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^\s;"]+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object.
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset that is not a known codec.
            return webpage_bytes.decode('utf-8', 'replace')
64ce2aad 146
d830b7c2 147
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matched with re.VERBOSE: group 2 is the video ID, group 1 the optional
    # URL prefix preceding it.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Format codes missing from this table fall back to 'flv' at lookup time.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; leave those to the playlist IE.
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to fetch the list of available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles for one language and format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles.

        sub_lang_list is a dict keyed by language code.
        """
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for the video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle tracks available for video_id.

        Returns a dict mapping language code to track name on success, or a
        tuple (error_message, None) on failure. Callers tell the two apart
        with isinstance(result, tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # findall yields (name, lang_code); index the result by lang_code.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # The lookup failed and returned (error_message, None); report it
            # instead of treating the tuple as a dict.
            self._downloader.report_error(sub_lang_list[0])
            return
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option, then English, then whatever
        # language the dict yields first.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """
        Return a list with one tuple per available language:
        [(error_message, sub_lang, sub), ...]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the extension and dimensions of every format code in formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available, log in
        and confirm age.

        Every failure is reported through the downloader and ends the
        initialization early.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form fields that have to be echoed back with the login POST.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode. Fields that could not be found on the login page
        # (value None) are left out rather than crashing on None.encode().
        login_form = dict((k.encode('utf-8'), v.encode('utf-8'))
                          for k, v in login_form_strs.items() if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data must be bytes on Python 3, same as for the login request.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            # Only the request matters; the response body is not used.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID found in url.

        An error is reported and None is returned when url does not match
        _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the information dictionaries for the video at url.

        Returns a list with one dictionary per selected format. Returns None
        after reporting an error, or after only listing formats or
        subtitles.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)
        if video_id is None:
            # _extract_id() already reported the invalid URL.
            return

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslash escapes from the embedded URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try the different "el" request variants until one returns a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Not in this format; try the next one. If none matches,
                    # the normalized page text is kept unchanged.
                    continue
                break

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # Listing only: nothing is extracted for download.
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                stream_url = ud['url'][0]
                # Only 'itag' and 'url' are guaranteed by the filter above;
                # append the signature when the entry carries one.
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Keep only the formats at or below the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
d77c3dfd
FV
645
646
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it from UTF-8 if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Retrieve the family-filter disclaimer and submit the age confirmation form.

        Failures are reported through the downloader and end the
        initialization early.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            # Only the request matters; the response body is not used.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # POST data must be bytes on Python 3.
        filter_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, filter_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionary for the video at url.

        Returns a one-element list on success. Returns None after reporting
        an error, or after handing a YouTube-hosted video ("yt-" id prefix)
        over to the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once so the str regexes below work on Python 3 and every
        # extracted field is already text.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the media data embedded in the flashvars attribute.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        # video_id comes from the caller's URL and may still be a byte string
        # on Python 2; _to_text() leaves text values untouched.
        return [{
            'id':       self._to_text(video_id),
            'url':      self._to_text(video_url),
            'uploader': self._to_text(video_uploader),
            'upload_date':  None,
            'title':    video_title,
            'ext':      self._to_text(video_extension),
        }]
d77c3dfd
FV
772
773
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Tell the user that metadata extraction for video_id has started."""
        message = u'[dailymotion] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Errors are reported through the downloader and None is returned
        when a mandatory piece of information cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The URL slug looks like "<id>_<simplified-title>?<query>";
        # only the leading id is kept.
        slug = url_match.group(1)
        video_id = slug.split('_')[0].split('?')[0]

        # Ask for the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Keys are listed in the order they are tried; the first one that
        # occurs in the flashvars text is used.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        url_pattern = r'"' + max_quality + r'":"(.+?)"'
        media_match = re.search(url_pattern, flashvars)
        if media_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        unquoted_url = compat_urllib_parse.unquote(media_match.group(1))
        video_url = unquoted_url.replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # The uploader is optional: try the regular owner markup first,
        # then the markup used for official users.
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                video_uploader = None
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; the info dictionary wants YYYYMMDD.
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is None:
            video_upload_date = None
        else:
            day, month, year = date_match.group(1, 2, 3)
            video_upload_date = year + month + day

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': 'mp4',
        }
        return [info]
d77c3dfd
FV
861
862
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Errors are reported through the downloader and None is returned
        when the page cannot be fetched or a mandatory field is missing.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        video_extension = u'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        # read() yields bytes; decode once here so the str regexes below
        # work on Python 3 and every extracted value is already text.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # The page title carries both the video title and the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
926
927
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _fetch_text(self, request):
        """Open request and return its body as text.

        On a network error the problem is reported through the downloader
        and None is returned.
        """
        try:
            data = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        # Decode once so the str regexes used by the caller also work on
        # Python 3, where read() returns bytes.
        return data.decode('utf-8', 'replace')

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for url.

        new_video is not read by this method; it is kept so existing
        callers (including the recursive call below) keep working.
        Errors are reported through the downloader and None is returned.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = u'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            webpage = self._fetch_text(compat_urllib_request.Request(url))
            if webpage is None:
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # The rewritten URL matches _VPAGE_URL, so this recursion is
            # at most one level deep.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        self.report_download_webpage(video_id)
        webpage = self._fetch_text(compat_urllib_request.Request(url))
        if webpage is None:
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 is the literal "people"/"profile" path segment; the
        # uploader name is the link text captured by group 2.
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = u'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request(
            'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        playlist = self._fetch_text(request)
        if playlist is None:
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', playlist)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1069
1070
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for url.

        new_video is not read by this method; it is kept for interface
        compatibility. Errors are reported through the downloader and
        None is returned.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct HLS redirect links carry no metadata page; use the
            # canonical video page instead.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is absent from the page.
            # ValueError: the text between the markers is not valid JSON.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = config["video"]["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
d77c3dfd
FV
1189
1190
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body as text.

        Returns None after reporting the error when the URL is invalid or
        the download fails.
        """
        try:
            # Building the Request is inside the try block because, on
            # Python 3, an unusable URL already raises ValueError here.
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        except ValueError:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return None
        # read() yields bytes; the callers search the page with str regexes.
        return webpage.decode('utf-8', 'replace')

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the wanted groups.

        matchTuples is a sequence of (group_index, key, error_message)
        tuples. Returns a dictionary mapping each key to the text of its
        group, or None (after reporting the problem) when the download
        fails, the regex does not match, or a requested group is empty.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage has already reported the failure.
            return None

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return None

        info = {}
        for (i, key, err) in matchTuples:
            value = mobj.group(i)
            if value is None:
                self._downloader.trouble(err)
                return None
            info[key] = value

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP URL of a live stream page.

        NOTE(review): the resolved URL is computed but not returned, and
        _real_extract ignores this method's result, so live streams yield
        no download. Behaviour kept as found.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
            r'(http://.*?\.swf).*?' +
            r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference documents of an ARTE+7 video page.

        Returns the info dictionary of the HD stream, or None when any
        step fails (the failure has been reported by grep_webpage).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
            r'<name>(.*?)</name>.*?' +
            r'<dateVideo>(.*?)</dateVideo>.*?' +
            r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return None

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            # The page was decoded in fetch_webpage, so this is already text.
            'title': info.get('title'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary, or None."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            # Never hand a [None] result to the downloader.
            return
        return [info]
f2ad10a9
CA
1325
1326
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True when url redirected somewhere else and the target was
        handed back to the downloader, False otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # The origin_req_host attribute exists on both Python 2
                    # and 3; get_origin_req_host() was removed in 3.4.
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.origin_req_host,
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.origin_req_host,
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            # The opener only knows HTTP(S); for any other protocol no
            # handler answers and open() returns None. Treat that as "not
            # a redirect" and let the regular download path deal with it.
            return False
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Returns None when url was a redirect (already re-dispatched) or
        when no media URL or title could be found; errors are reported
        through the downloader.
        """
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # The id is the media file name without its extension, and the
        # extension (minus the leading dot) becomes the container format.
        video_id, video_extension = os.path.splitext(os.path.basename(video_url))
        video_extension = video_extension[1:]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
d77c3dfd
FV
1470
1471
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # _real_extract hands the query over UTF-8 encoded, so it has to
        # be decoded as UTF-8 (not with the locale encoding) for display.
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "ytsearch[N|all]:terms" query and download the results.

        Without a count one result is fetched; "all" fetches up to
        _max_youtube_results. Always returns None; the videos are queued
        directly on the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons.
        prefix, terms = query.split(':', 1)
        prefix = prefix[len('ytsearch'):]

        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._max_youtube_results
        else:
            # Only the integer conversion is guarded, so a ValueError
            # raised while downloading is not mistaken for a bad count.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                n = 1
            else:
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, terms))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results

        self._download_n_results(terms.encode('utf-8'), n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; limit shrinks to the real
        # number of hits once the first response is known.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past n.
        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
d77c3dfd
FV
1550
1551
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # _real_extract hands the query over UTF-8 encoded, so it has to
        # be decoded as UTF-8 (not with the locale encoding) for display.
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "gvsearch[N|all]:terms" query and download the results.

        Without a count one result is fetched; "all" fetches up to
        _max_google_results. Always returns None; the videos are queued
        directly on the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons.
        prefix, terms = query.split(':', 1)
        prefix = prefix[len('gvsearch'):]

        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._max_google_results
        else:
            # Only the integer conversion is guarded, so a ValueError
            # raised while downloading is not mistaken for a bad count.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                n = 1
            else:
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, terms))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results

        self._download_n_results(terms.encode('utf-8'), n)

    def _download_ids(self, video_ids):
        """Queue every collected Google Video id on the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            # Pages are reported 1-based, as YoutubeSearchIE does.
            self.report_download_page(query, pagenum + 1)
            # The start offset advances by 10 per result page.
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page_bytes = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return
            # read() yields bytes; the indicator patterns are str regexes.
            page = page_bytes.decode('utf-8', 'replace')

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_ids(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "next" link: download whatever was found.
                self._download_ids(video_ids)
                return

            pagenum = pagenum + 1
d77c3dfd
FV
1632
1633
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts pseudo-URLs of the form ``yvsearch:QUERY``, ``yvsearchN:QUERY``
    or ``yvsearchall:QUERY`` and hands every matching video page URL to the
    downloader.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ``yvsearch`` prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # legitimately contain colons (_VALID_URL allows any character).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so that a ValueError raised
        # while downloading is not mistaken for a malformed prefix.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order and
            # stopping as soon as the requested amount has been collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        break

            # Stop when enough videos were found or there is no next page.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
        return
d77c3dfd
FV
1718
1719
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the playlist feed, orders the entries by their reported
    position and passes each video URL to the downloader.
    """

    # Verbose-mode pattern: must always be matched with re.VERBOSE.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of the playlist and download the selected range."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # start-index is 1-based
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                if page_num == 1:
                    self._downloader.report_error(u'Got a malformed response from YouTube API')
                    return
                # A follow-up page without entries is only requested when every
                # previous page was full, i.e. the playlist size is an exact
                # multiple of _MAX_RESULTS. Presumably the API omits 'entry'
                # for an empty page - keep what was collected so far.
                break

            entries = response['feed']['entry']
            videos += [(entry['yt$position']['$t'], entry['content']['src'])
                       for entry in entries
                       if 'content' in entry]

            # A short page means this was the last one.
            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
d77c3dfd
FV
1810
1811
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated video listing of a channel and passes every video
    found to the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce that the given listing page of the channel is being fetched."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Gather the video ids from every listing page, then download each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected_ids = []

        # Listing pages are numbered from 1; keep going while the page
        # advertises a "Next" link.
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            listing_request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(listing_request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Ids are de-duplicated within one page only, in first-seen order.
            page_ids = []
            for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = link.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected_ids += page_ids

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for video_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
902b2a0a
FV
1862
1863
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Lists a user's uploads through the paginated uploads feed and passes the
    selected range of videos to the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Announce which slice of the user's uploads is being fetched."""
        last_index = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, last_index))

    def _real_extract(self, url):
        """Collect the user's video ids page by page and download the selected range."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = match.group(1)

        # The feed is requested in slices of _GDATA_PAGE_SIZE ids, so it is
        # queried repeatedly until a slice comes back short.
        collected_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1  # feed indices are 1-based
            self.report_download_page(username, start_index)

            feed_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            request = compat_urllib_request.Request(feed_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Ids are de-duplicated within one page only, in first-seen order.
            page_ids = []
            for hit in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = hit.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)

            collected_ids += page_ids

            # A page holding fewer than _GDATA_PAGE_SIZE ids is taken to be
            # the last one, which saves one extra request.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        all_ids_count = len(collected_ids)
        params = self._downloader.params
        playliststart = params.get('playliststart', 1) - 1
        playlistend = params.get('playlistend', -1)

        # -1 means "no upper bound".
        stop = None if playlistend == -1 else playlistend
        selected_ids = collected_ids[playliststart:stop]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(selected_ids)))

        for video_id in selected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
d77c3dfd
FV
1945
1946
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users id from the user's page, then pages through
    the episode list and passes the selected range of videos to the
    downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect the user's video paths page by page and download the selected range."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The episode list is keyed by the users id embedded in the page;
        # without it nothing can be listed, so report instead of crashing.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users id for user %s' % username)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape before the duplicate check
            # so that the comparison is made against what is actually stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
eeeb4daa
JCGS
2036
2037
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the file behind *url*.

        Returns None after reporting an error when the page cannot be
        retrieved or the download URL / title cannot be found.
        """
        # The file id is the last path component of the URL.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace runs so the message fits on one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
d77c3dfd
FV
2096
2097
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with the configured credentials, if any.

        Credentials come from the downloader's 'username'/'password'
        parameters or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Every failure is reported as a warning only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Being served the login form again means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video behind *url*.

        Raises ExtractorError when the embedded player data, the video URL
        or the title cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON literal sitting between these two
        # script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD. .get() is used for both so
        # that a missing key ends in the ExtractorError below rather than in a
        # bare KeyError.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
59ae15a5 2195
d77c3dfd
FV
2196
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Announce that information is being extracted for *file_id*."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Announce that the URL turned out to be the media file itself."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video behind *url*.

        '/play/' URLs are resolved through their redirect first. When the
        server answers with a video content type the URL is treated as a
        direct download; otherwise the JSON description is parsed.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if compat_urllib_parse_urlparse(url).path.startswith('/play/'):
            # Follow the redirect; the file id sits in the fragment of the
            # final URL.
            play_response = compat_urllib_request.urlopen(compat_urllib_request.Request(url))
            redirect_parts = compat_urllib_parse_urlparse(play_response.geturl())
            file_param = compat_parse_qs(redirect_parts.fragment)['file'][0]
            file_id = file_param.rpartition('/')[2]
            return self._real_extract('http://blip.tv/a/a-' + file_id)

        separator = '&' if '?' in url else '?'
        json_url = url + separator + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(url_match.group(1))

        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            content_type = urlh.headers.get('Content-Type', '')
            if content_type.startswith('video/'):  # Direct download
                title, ext = os.path.splitext(url.split('/')[-1])
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is not None:
            return [info]

        # Regular URL: the response body is the JSON description.
        try:
            json_code = urlh.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
            return

        try:
            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' object.
            data = json_data['Post'] if 'Post' in json_data else json_data

            stamp = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p')
            upload_date = stamp.strftime('%Y%m%d')
            video_url = data['media']['url']
            ext_match = re.match(self._URL_EXT, video_url)
            if ext_match is None:
                raise ValueError('Can not determine filename extension')
            ext = ext_match.group(1)

            info = {
                'id': data['item_id'],
                'url': video_url,
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'ext': ext,
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl'],
                'user_agent': 'iTunes/10.6.1',
            }
        except (ValueError, KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
            return

        return [info]
d77c3dfd
FV
2297
2298
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict of the video behind *url*.

        Returns None after reporting an error when the URL is invalid or
        the media URL / title cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Was self._download, an attribute that does not exist.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media URL is derived from the thumbnail link: same base path,
        # with '/<video_id>.flv' appended instead of '/thumbs/...'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
d77c3dfd
FV
2347
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepted URL forms:
    #   abbreviations like :thedailyshow or :colbert
    #   full episodes:  http://www.thedailyshow.com/full-episodes/<episode>
    #   clips like:     http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #   or:             http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #   or:             http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # The pattern is written for re.VERBOSE; always match it with that flag.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report that information extraction for episode_id has started."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration for one part is being downloaded."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the show index feed is being downloaded."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print one line per bitrate in formats with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    @staticmethod
    def _format_upload_date(pub_date):
        """Convert an RFC 822 date string (RSS pubDate) to YYYYMMDD.

        Returns None when pub_date is empty or cannot be parsed, so that the
        'upload_date' field never carries a value in the wrong format.
        """
        if not pub_date:
            return None
        parsed = email.utils.parsedate(pub_date)
        if parsed is None:
            return None
        return u'%04d%02d%02d' % parsed[:3]

    def _real_extract(self, url):
        """Extract every part of an episode or clip.

        Returns a list of info dictionaries (one per <item> of the index
        feed), or None after reporting an error or after listing formats.
        Raises ExtractorError when an RTMP URL cannot be mapped to HTTP.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Expand the abbreviation to the show's full-episodes landing page
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "whatever the site redirects to"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            webpage = htmlHandle.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            # 'episode' is '' for the bare landing page and None when the
            # redirect ended on a clip URL; neither yields a usable title.
            if not mobj.group('episode'):
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # Default to the last rendition listed (presumably the highest
            # bitrate -- this relies on the feed's ordering).
            fmt, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    fmt, rtmp_video_url = f, v
                    break

            # Map the RTMP stream path onto the HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                # The InfoExtractor contract requires YYYYMMDD here
                'upload_date': self._format_upload_date(officialDate),
                'title': effTitle,
                'ext': 'mp4',
                'format': fmt,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
d77c3dfd
FV
2542
2543
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report that information extraction for showName has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    @staticmethod
    def _read_text(handle):
        """Read an opened URL handle completely and return it as text.

        The charset is taken from a 'text/html; charset=...' Content-Type
        header when present; otherwise utf-8 is assumed.
        """
        raw = handle.read()
        content_type = handle.headers.get('Content-Type') or ''
        m = re.match(r'text/html; charset="?([^"]+)"?', content_type)
        return raw.decode(m.group(1) if m else 'utf-8')

    def _real_extract(self, url):
        """Return a one-element list with the video's info dictionary.

        Returns None after reporting an error when the page or its player
        configuration cannot be downloaded or understood.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = self._read_text(compat_urllib_request.urlopen(url))
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are optional fields: tolerate their absence
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL carries the (quoted) configuration URL; both are required
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = self._read_text(compat_urllib_request.urlopen(configUrl))
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        try:
            # The second playlist entry is the one used as the video
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.report_error(u'unable to find video URL in configuration file')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
d77c3dfd 2617
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the video's info dictionary.

        Downloads the metadata XML, then the f4m manifest it points to, and
        builds the URL of the first fragment. Returns None after reporting
        an error when a download fails or a document lacks expected nodes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # Kept separate from video_id: info['id'] stays the id from the page URL
            manifest_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: node missing; KeyError: <media> has no url attribute
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # NOTE(review): the last two characters of the manifest id are dropped
        # to form the path -- presumably a fixed suffix; confirm against a live manifest.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + manifest_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
d77c3dfd
FV
2688
2689
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Tell the user that metadata extraction for video_id is starting."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Scrape the video page for the stream URL, title and thumbnail.

        Returns a one-element list holding the info dictionary, or None
        after reporting an error when any of the three cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Stream location: a URL-quoted flv_url parameter
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return

        # Title: text of <title> up to the site-name suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return

        # Thumbnail: the whole matched URL is used, not just the file name group
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(flv_match.group(1)),
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
d77c3dfd
FV
2747
2748
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    The track page URL is first resolved to track metadata through
    api.soundcloud.com/resolve.json; the numeric track id from that
    response is then used to request the stream definitions, from which
    the 128 kbit/s HTTP MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # API key sent with every request to the SoundCloud API
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved to an id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the track's info dictionary.

        Returns None after reporting an error when the URL is invalid, a
        download fails, or the API answers the resolve request with errors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical URL so that trailing junk is not sent to the API
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        # Same error envelope that SoundcloudSetIE handles for this endpoint
        if 'errors' in info:
            for api_error in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(api_error['error_message']))
            return

        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
d77c3dfd 2821
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

    The set page URL is first resolved to set metadata through
    api.soundcloud.com/resolve.json; for every track listed there the
    stream definitions are requested and the 128 kbit/s HTTP MP3 URL is
    taken from them.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # API key sent with every request to the SoundCloud API
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set URL is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a list with one info dictionary per track of the set.

        Returns None after reporting an error when the URL is invalid, a
        download fails, or the API answers the resolve request with errors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical URL so that trailing junk is not sent to the API
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for api_error in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(api_error['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction(full_title)

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # A single failing track aborts the whole set
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2902
d77c3dfd
FV
2903
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the presentation's info dictionary.

        Returns None after reporting an error when the URL is invalid or the
        stream path, title or file extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The real id is only known after parsing, so the URL stands in for it
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        # The attribute holds the base64-encoded, URL-quoted stream path
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot only, so that file names containing extra
        # dots no longer raise ValueError during unpacking.
        video_filename = video_url.split('/')[-1]
        video_id, dot_extension = os.path.splitext(video_filename)
        if not dot_extension:
            self._downloader.report_error(u'unable to extract video extension')
            return
        extension = dot_extension[1:]

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
d77c3dfd
FV
2957
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    @staticmethod
    def _to_unicode(value):
        """Return value as text, decoding it as utf-8 only if it is a byte string.

        Text values pass through untouched, so the call is safe on Python 3
        (where str has no decode method) and on already-decoded Python 2
        unicode objects.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                handle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # The connection was only a liveness probe; do not leak it
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print one line per format with its bitrate (when known) and extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the cloudcast's info dictionary.

        Returns None after reporting an error (invalid URL, failed download,
        unavailable format, no reachable media URL) or after listing formats.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = self._to_unicode(mobj.group(1))
        slug = self._to_unicode(mobj.group(2))
        file_id = uploader + u'-' + slug

        # construct API request from the matched groups, so that the result
        # does not depend on whether the URL ends with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + slug + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that has a reachable URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            self._downloader.report_error(u'unable to find a working media URL')
            return
        file_url = self._to_unicode(file_url)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_unicode(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_unicode(player_url),
        }]
d77c3dfd
FV
3072
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'
    # Base against which the relative links found in the pages are resolved
    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _extract_references(self, info, link_pattern, page):
        """Collect the links matching link_pattern in page and extract each one.

        The de-duplicated links are stored as 'reference' entries in
        info['list']; the results of running self.extract() on every entry
        are concatenated and returned.
        """
        links = orderedSet(re.findall(link_pattern, page))
        info['list'] = [
            {
                'type': 'reference',
                'url': self._MAIN_FOLDER + unescapeHTML(link),
            }
            for link in links]

        results = []
        for entry in info['list']:
            results += self.extract(entry['url'])
        return results

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses.

        Which of the three applies depends on the 'course' and 'video'
        query parameters captured by _VALID_URL. Returns a list of info
        dictionaries, or None after reporting an error for a single video.
        Raises ExtractorError for an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            return self._extract_references(info, r'<a href="(VideoPage.php\?[^"]+)">', coursepage)
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # Use the same helper as the course branch so that the page is
            # decoded to text before the text pattern below is applied.
            # NOTE(review): failures now follow _download_webpage's error
            # handling instead of report_error -- confirm this is acceptable.
            rootURL = self._MAIN_FOLDER + 'HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        note='Downloading webpage',
                                        errnote='Unable to download course info page')

            info['title'] = info['id']

            return self._extract_references(info, r'<a href="(CoursePage.php\?[^"]+)">', rootpage)
d77c3dfd
FV
3184
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    @staticmethod
    def _latin1_text(value):
        """Return value as text, decoding from ISO-8859-1 only if it is a byte string.

        Regex groups taken from an already-decoded page are text; calling
        .decode() on them raises AttributeError on Python 3.
        """
        if isinstance(value, bytes):
            return value.decode('iso-8859-1')
        return value

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(self._latin1_text(mobj.group(1)))

        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(self._latin1_text(mobj.group(1)))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Indexing an empty list below would raise IndexError.
            self._downloader.report_error(u'unable to find any video rendition')
            return

        # For now, always pick the last rendition, which the original code
        # assumes is the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # find() returns None when there is no <src> child, hence AttributeError.
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
6de7ef9b 3264
302efc19 3265
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (videos are served in segments)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id from the current time in milliseconds and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by a linear congruential sequence started at seed.

        The result is a list of single characters; _get_file_id indexes into it.
        """
        mixed = []
        # The backslash is part of the alphabet; it is escaped explicitly
        # instead of relying on "\:" being an unrecognised escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # All characters are distinct, so popping by position removes
            # exactly the character that was picked.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract every segment of a Youku video.

        Returns a list with one info dictionary per segment, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError/TypeError cover an empty 'data' list or a JSON
            # document with an unexpected shape.
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 of the file id (fileid[8:10]) are replaced by the
        # two-digit hexadecimal segment number.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
5dc846fa
FV
3375
3376
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report webpage download"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail of an xnxx.com video page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error, UnicodeDecodeError) as err:
            # UnicodeDecodeError: the page is not valid UTF-8.
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
fd873c69
FV
3439
3440
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video attached to a Google+ post.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract upload date
        upload_date = None
        mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                # The date is optional; an unexpected format must not abort
                # the whole extraction.
                self._downloader.report_warning(u'unable to parse upload date: %s' % mobj.group(1))
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Pick the entry whose first captured field (treated by the original
        # code as the resolution) is numerically largest. Comparing the raw
        # strings would rank '360' above '1080'.
        best = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only the URL is needed; the resolution has no further use.
        video_url = best[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
4cc3d074
PH
3564
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the info dictionary for one NBA video.

        The media URL is derived from the path part of the page URL; title,
        date and description are scraped from the page when present.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = url_match.group(1)
        index_suffix = '/index.html'
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First group of rexp in the page, HTML-unescaped, else default.
            found = re.search(rexp, webpage)
            return unescapeHTML(found.group(1)) if found else default

        # Last path component of the video id.
        shortened_video_id = video_id.rpartition('/')[2]
        raw_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
        title = raw_title.replace('NBA.com: ', '')

        # NOTE(review): 'uploader_date' is not one of the fields listed in the
        # InfoExtractor docstring ('upload_date' is) -- confirm whether this
        # key is intentional before renaming it.
        uploader_date = _findProp(r'<b>Date:</b> (.*?)</div>')
        description = _findProp(r'<div class="description">(.*?)</h1>')

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader_date': uploader_date,
            'description': description,
        }]
0b40544f
DV
3600
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page and convert its clips to info dictionaries.

        Returns a tuple (count of items in the response, list of *valid*
        items). On any failure the error is reported and (0, []) is returned,
        so the caller can always unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        try:
            response = json.loads(webpage)
        except ValueError as err:
            self._downloader.report_error(u'invalid video info JSON: %s' % compat_str(err))
            return (0, [])

        if not isinstance(response, list):
            # The API answers with an object describing the error instead of
            # a list of clips.
            if isinstance(response, dict):
                error_text = response.get('error', 'unknown error')
            else:
                error_text = 'unknown error'
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])

        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # Keep the first ten characters of start_time and drop the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single archived broadcast or every archive of a channel.

        A channel URL (only the first group matched) is fetched page by page
        until a page comes back with fewer items than the page limit.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
21a9c6aa
PH
3687
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the MP4 source, title and description of a Funny or Die video.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The pattern captures the src of the second <source> inside <video>.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title>.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
d0d4f277 3726
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a game on store.steampowered.com."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one info dictionary per video listed on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three match streams are paired by position; zip stops at the
        # shortest one.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # Do not emit an entry that has no URL to download.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
ef0c8d5f 3767
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded Ustream video.

        The media URL is built from the video id; the title (required) and
        the uploader (optional) are scraped from the page.
        Raises ExtractorError when the title cannot be found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            # 'uploader' is an optional field, so its absence is not fatal.
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
4aeae91f 3789
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail of a WorldStar video page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        # Validate the URL before any network access so a bad URL is reported
        # instead of failing on a None match after the download.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group('id')

        # Same download helper the other extractors in this file use.
        webpage_src = self._download_webpage(url, video_id)

        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
        mobj = re.search(_src_url, webpage_src)
        if mobj is None:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return
        video_url = mobj.group()
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        _title = r"""<title>(.*)</title>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
3845
ca0a0bbe
PH
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the show metadata embedded in the page as JSON and build the info dictionary.

        Raises ExtractorError when the metadata is missing or is not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        found = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not found:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(found.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path.
        media_path = compat_urllib_parse_urlparse(video_url).path
        video_ext = media_path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
4aeae91f 3880
991ba7fa
JC
3881
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract the download links of a video and pick the requested format.

        Returns a list of info dictionaries (a single entry unless every
        format was requested), or None after listing the formats or reporting
        an error. Raises ExtractorError when the title, the download list or
        its links cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component, e.g. '480p_370k_8004515' -> ['480p', '370k']
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Test the lookup result itself; the earlier regex match object
            # is never None at this point.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3998
6324fd1d 3999
991ba7fa
JC
4000
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV URL and upload date of a pornotube.com video.

        The title is taken from the page URL itself. Returns a one-element
        list with the info dictionary, or None after reporting an error
        through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This branch is about the date, not the title.
            self._downloader.report_error(u'unable to extract upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4042
991ba7fa
JC
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Build the info dictionary for one youjizz.com video page.

        The title comes from the video page; the media URL comes from the
        embed page that the video page links to. The returned id is the
        numeric id found in the embed link, not the one from the URL.

        Returns a one-element list with the info dictionary, or None after
        reporting an invalid URL. Raises ExtractorError when the title, the
        embed link or the media URL cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id from the URL is only used while fetching the video page
        page_id = url_match.group('videoid')
        video_page = self._download_webpage(url, page_id)

        # Title of the video page
        title_match = re.search(r'<title>(?P<title>.*)</title>', video_page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Link to the embed page, which also carries the numeric video id
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', video_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        # Media URL handed to the flash player on the embed page
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
4088
ccf65f9d
PH
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at ``url``.

        The mix description is read from the ``PAGE.mix`` JSON embedded in
        the page. Tracks are then requested one at a time from the play/next
        API, using a random session number, until the API flags the last
        track.

        Raises ExtractorError when the URL does not match or the mix
        description cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']

        # The first request starts playback; later ones ask for the next track
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        position = 0
        while True:
            position += 1
            api_json = self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
991ba7fa 4132
da06e2da
OK
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a keek.

        The media and thumbnail URLs are built from the video id in the URL;
        the title and the uploader are scraped from the page.

        Raises ExtractorError when the URL does not match or the page has no
        og:title meta tag. The uploader is optional and is None when the
        user block is not found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            # Fail with a readable error instead of AttributeError on None
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find title')
        title = unescapeHTML(m.group('title'))

        # The uploader is optional metadata, so a miss must not abort
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
4156
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose pattern: it must always be matched with the re.VERBOSE flag
    _VALID_URL=r'''http://www.ted.com/
                   (
                    ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                    |
                    ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Tell whether ``url`` matches the verbose _VALID_URL pattern."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return the info dictionaries for a talk or a playlist URL.

        A talk URL gives a one-element list; a playlist URL gives one entry
        per talk listed on the playlist page.
        """
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        """Build the mp4 download link for the given media slug."""
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        """Return the info dictionaries of every talk on a playlist page.

        Talk ids and talk links are collected with two independent scans of
        the page and paired up in document order.
        """
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        playlist_page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        id_matches = re.finditer(video_RE, playlist_page, re.VERBOSE)
        link_matches = re.finditer(video_name_RE, playlist_page)
        return [
            self._talk_info('http://www.ted.com%s' % link_match.group('talk_url'),
                            id_match.group('video_id'))
            for id_match, link_match in zip(id_matches, link_matches)
        ]

    def _talk_info(self, url, video_id=0):
        """Return the info dictionary of the talk found at ``url``.

        ``video_id`` is only used while downloading the page; the id stored
        in the result is the one read from the page's talkDetails script.
        """
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        talk_page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)

        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, talk_page).group('title')

        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                    "id":(?P<videoID>[\d]+).*?
                    "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, talk_page)
        details = re.search(info_RE, talk_page, re.VERBOSE)

        video_id = details.group('videoID')
        video_url = self._talk_video_link(details.group('mediaSlug'))
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
da06e2da 4229
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        The video id is the last non-empty path element of the URL. All
        other values are read from the metadata XML served for that id.

        Returns None after reporting an error through the downloader when
        the metadata has no ``url_flv`` or no ``title`` element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        # extension of the media file, without the leading dot
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension (this used to reference the
            # undefined name `ext` and raised NameError)
            format = extension
        else:
            format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4285
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        The title is scraped from the video page. File name and duration are
        read from the last child element of the per-video XML document.

        Raises ExtractorError when the page carries no title block.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last entry of the document is used
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4318
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # `https?` accepts both schemes; the former `http?` rejected https URLs
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        The media URL and the title are required. Description and uploader
        are optional and are None when they are not found in the page.

        Returns None after reporting an error through the downloader when
        the URL does not match or a required value is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Stop here: m is None, so reading the title below would crash
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4367
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        A numeric ``documentId`` in the URL is preferred as the video id;
        otherwise the last path element is used. Among the streams of media
        type 0 the one with the highest quality is chosen.

        Returns None after reporting an error through the downloader when
        the page lists no media streams.
        """
        # determine video id from url
        url_match = re.match(self._VALID_URL, url)
        document_id = re.search(r'documentId=([0-9]+)', url)
        video_id = document_id.group(1) if document_id else url_match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [found.groupdict() for found in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if not stream['rtmp_url']:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
            return [info]

        self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
        assert stream['video_url'].startswith('mp4:')
        info["url"] = stream["rtmp_url"]
        info["play_path"] = stream['video_url']
        return [info]
4407
e32b06e9 4408
4aeae91f
PH
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Kept in matching priority order; GenericIE stays last
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]