# jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
# Split code as a package, compiled into an executable zip
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
import datetime
import email.utils
import HTMLParser
import httplib
import netrc
import os
import re
import socket
import time
import urllib
import urllib2
import warnings
15
16 try:
17 import cStringIO as StringIO
18 except ImportError:
19 import StringIO
20
21 # parse_qs was moved from the cgi module to the urlparse module recently.
22 try:
23 from urlparse import parse_qs
24 except ImportError:
25 from cgi import parse_qs
26
27 try:
28 import lxml.etree
29 except ImportError:
30 pass # Handled below
31
32 try:
33 import xml.etree.ElementTree
34 except ImportError: # Python<2.5: Not officially supported, but let it slip
35 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
36
37 from Utils import *
38
39
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces one or more
	dictionaries describing the video(s) it refers to: the real video
	URL, the title and simplified title, the uploader and so on.  Each
	dictionary is handed to the FileDownloader, which may download the
	video file, print metadata, etc.  Required fields in every
	dictionary:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	Optional fields, used mainly when youtube-dl serves as the backend
	for a video search front end (e.g. youtube2mp3) and their forced
	printing functions are invoked:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses must define a _VALID_URL regexp and override
	_real_initialize() and _real_extract(); they should normally also
	be registered in the list of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Run one-time initialization (authentication, etc.) lazily."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract info for the URL as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader used for reporting and processing."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization process; redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction process; redefine in subclasses."""
		pass
108
109
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video ids as well as youtu.be, watch, embed and
	# youtube-nocookie URLs; playlist/artist pages are explicitly excluded
	# (they are handled by other extractors).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> file extension; unknown itags fall back to 'flv' at lookup time
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# itag -> dimension string shown by --list-formats.
	# NOTE(review): values read as height x width (e.g. '22': '720x1280') -- confirm.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # default caption duration when none is given
			start = float(start)
			end = start + float(dur)
			# hh:mm:ss,mmm timestamps as required by the SRT format
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
			# NOTE(review): SRT cue numbers conventionally start at 1; this emits 0-based indices
			srt += str(n) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print each available itag with its extension and dimensions."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set the site language and, when credentials are available, log in and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
			'current_form': 'loginForm',
			'next': '/',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, authentication failed
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
			'next_url': '/',
			'action_confirm': 'Confirm',
		}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the watch/get_video_info pages and hand one info dict per selected format to the downloader."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo the JSON backslash-escaping (\/ -> /)
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' values, as different videos
		# only answer to some of them; stop at the first response that
		# carries a 'token'.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			# Normalize separators to spaces, then try the known date formats
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass  # try the next format

		# description
		try:
			lxml.etree
		except NameError:
			# lxml is not available: fall back to a crude regex on the meta tag
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			try:
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			else:
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if srt_lang_list:
					# Pick the requested language, else English, else the first listed one
					if self._downloader.params.get('subtitleslang', False):
						srt_lang = self._downloader.params.get('subtitleslang')
					elif 'en' in srt_lang_list:
						srt_lang = 'en'
					else:
						srt_lang = srt_lang_list[0]
					if not srt_lang in srt_lang_list:
						self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
					else:
						request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
						try:
							srt_xml = urllib2.urlopen(request).read()
						except (urllib2.URLError, httplib.HTTPException, socket.error), err:
							self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
						else:
							video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				else:
					self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		# NOTE(review): video_token is extracted but never used below
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated stream map
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description,
					'player_url': player_url,
					'subtitles': video_subtitles
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
486
487
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate 'yt-' prefixed videos
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used for videos that are YouTube embeds."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and post the form that disables the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a metacafe.com watch URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate to the YouTube extractor
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Older page layout: media URL directly in the page
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer layout: media data embedded in the flashvars parameter
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Unescape JSON slashes and append the access key
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
628
629
630 class DailymotionIE(InfoExtractor):
631 """Information Extractor for Dailymotion"""
632
633 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
634 IE_NAME = u'dailymotion'
635
636 def __init__(self, downloader=None):
637 InfoExtractor.__init__(self, downloader)
638
639 def report_download_webpage(self, video_id):
640 """Report webpage download."""
641 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
642
643 def report_extraction(self, video_id):
644 """Report information extraction."""
645 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
646
647 def _real_extract(self, url):
648 # Extract id and simplified title from URL
649 mobj = re.match(self._VALID_URL, url)
650 if mobj is None:
651 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
652 return
653
654 # At this point we have a new video
655 self._downloader.increment_downloads()
656 video_id = mobj.group(1)
657
658 video_extension = 'flv'
659
660 # Retrieve video webpage to extract further information
661 request = urllib2.Request(url)
662 request.add_header('Cookie', 'family_filter=off')
663 try:
664 self.report_download_webpage(video_id)
665 webpage = urllib2.urlopen(request).read()
666 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
667 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
668 return
669
670 # Extract URL, uploader and title from webpage
671 self.report_extraction(video_id)
672 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
673 if mobj is None:
674 self._downloader.trouble(u'ERROR: unable to extract media URL')
675 return
676 sequence = urllib.unquote(mobj.group(1))
677 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
678 if mobj is None:
679 self._downloader.trouble(u'ERROR: unable to extract media URL')
680 return
681 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
682
683 # if needed add http://www.dailymotion.com/ if relative URL
684
685 video_url = mediaURL
686
687 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
688 if mobj is None:
689 self._downloader.trouble(u'ERROR: unable to extract title')
690 return
691 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
692 video_title = sanitize_title(video_title)
693 simple_title = simplify_title(video_title)
694
695 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
696 if mobj is None:
697 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
698 return
699 video_uploader = mobj.group(1)
700
701 try:
702 # Process video information
703 self._downloader.process_info({
704 'id': video_id.decode('utf-8'),
705 'url': video_url.decode('utf-8'),
706 'uploader': video_uploader.decode('utf-8'),
707 'upload_date': u'NA',
708 'title': video_title,
709 'stitle': simple_title,
710 'ext': video_extension.decode('utf-8'),
711 'format': u'NA',
712 'player_url': None,
713 })
714 except UnavailableVideoError:
715 self._downloader.trouble(u'\nERROR: unable to download video')
716
717
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract video information from a video.google.com videoplay URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the escaped flv stream URL
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping of '=' and '&'
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = simplify_title(video_title)

		# Extract video description
		# NOTE(review): video_description is extracted but not passed to process_info below
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		# NOTE(review): video_thumbnail is likewise not passed to process_info below
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail only appears on the search results page
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
820
821
822 class PhotobucketIE(InfoExtractor):
823 """Information extractor for photobucket.com."""
824
825 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
826 IE_NAME = u'photobucket'
827
828 def __init__(self, downloader=None):
829 InfoExtractor.__init__(self, downloader)
830
831 def report_download_webpage(self, video_id):
832 """Report webpage download."""
833 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
834
835 def report_extraction(self, video_id):
836 """Report information extraction."""
837 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
838
839 def _real_extract(self, url):
840 # Extract id from URL
841 mobj = re.match(self._VALID_URL, url)
842 if mobj is None:
843 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
844 return
845
846 # At this point we have a new video
847 self._downloader.increment_downloads()
848 video_id = mobj.group(1)
849
850 video_extension = 'flv'
851
852 # Retrieve video webpage to extract further information
853 request = urllib2.Request(url)
854 try:
855 self.report_download_webpage(video_id)
856 webpage = urllib2.urlopen(request).read()
857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
858 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
859 return
860
861 # Extract URL, uploader, and title from webpage
862 self.report_extraction(video_id)
863 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
864 if mobj is None:
865 self._downloader.trouble(u'ERROR: unable to extract media URL')
866 return
867 mediaURL = urllib.unquote(mobj.group(1))
868
869 video_url = mediaURL
870
871 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
872 if mobj is None:
873 self._downloader.trouble(u'ERROR: unable to extract title')
874 return
875 video_title = mobj.group(1).decode('utf-8')
876 video_title = sanitize_title(video_title)
877 simple_title = simplify_title(video_title)
878
879 video_uploader = mobj.group(2).decode('utf-8')
880
881 try:
882 # Process video information
883 self._downloader.process_info({
884 'id': video_id.decode('utf-8'),
885 'url': video_url.decode('utf-8'),
886 'uploader': video_uploader,
887 'upload_date': u'NA',
888 'title': video_title,
889 'stitle': simple_title,
890 'ext': video_extension.decode('utf-8'),
891 'format': u'NA',
892 'player_url': None,
893 })
894 except UnavailableVideoError:
895 self._downloader.trouble(u'\nERROR: unable to download video')
896
897
898 class YahooIE(InfoExtractor):
899 """Information extractor for video.yahoo.com."""
900
901 # _VALID_URL matches all Yahoo! Video URLs
902 # _VPAGE_URL matches only the extractable '/watch/' URLs
903 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
904 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
905 IE_NAME = u'video.yahoo'
906
907 def __init__(self, downloader=None):
908 InfoExtractor.__init__(self, downloader)
909
910 def report_download_webpage(self, video_id):
911 """Report webpage download."""
912 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
913
914 def report_extraction(self, video_id):
915 """Report information extraction."""
916 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
917
918 def _real_extract(self, url, new_video=True):
919 # Extract ID from URL
920 mobj = re.match(self._VALID_URL, url)
921 if mobj is None:
922 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
923 return
924
925 # At this point we have a new video
926 self._downloader.increment_downloads()
927 video_id = mobj.group(2)
928 video_extension = 'flv'
929
930 # Rewrite valid but non-extractable URLs as
931 # extractable English language /watch/ URLs
932 if re.match(self._VPAGE_URL, url) is None:
933 request = urllib2.Request(url)
934 try:
935 webpage = urllib2.urlopen(request).read()
936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
937 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
938 return
939
940 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
941 if mobj is None:
942 self._downloader.trouble(u'ERROR: Unable to extract id field')
943 return
944 yahoo_id = mobj.group(1)
945
946 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
947 if mobj is None:
948 self._downloader.trouble(u'ERROR: Unable to extract vid field')
949 return
950 yahoo_vid = mobj.group(1)
951
952 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
953 return self._real_extract(url, new_video=False)
954
955 # Retrieve video webpage to extract further information
956 request = urllib2.Request(url)
957 try:
958 self.report_download_webpage(video_id)
959 webpage = urllib2.urlopen(request).read()
960 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
961 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
962 return
963
964 # Extract uploader and title from webpage
965 self.report_extraction(video_id)
966 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
967 if mobj is None:
968 self._downloader.trouble(u'ERROR: unable to extract video title')
969 return
970 video_title = mobj.group(1).decode('utf-8')
971 simple_title = simplify_title(video_title)
972
973 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
974 if mobj is None:
975 self._downloader.trouble(u'ERROR: unable to extract video uploader')
976 return
977 video_uploader = mobj.group(1).decode('utf-8')
978
979 # Extract video thumbnail
980 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
981 if mobj is None:
982 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
983 return
984 video_thumbnail = mobj.group(1).decode('utf-8')
985
986 # Extract video description
987 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
988 if mobj is None:
989 self._downloader.trouble(u'ERROR: unable to extract video description')
990 return
991 video_description = mobj.group(1).decode('utf-8')
992 if not video_description:
993 video_description = 'No description available.'
994
995 # Extract video height and width
996 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
997 if mobj is None:
998 self._downloader.trouble(u'ERROR: unable to extract video height')
999 return
1000 yv_video_height = mobj.group(1)
1001
1002 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1003 if mobj is None:
1004 self._downloader.trouble(u'ERROR: unable to extract video width')
1005 return
1006 yv_video_width = mobj.group(1)
1007
1008 # Retrieve video playlist to extract media URL
1009 # I'm not completely sure what all these options are, but we
1010 # seem to need most of them, otherwise the server sends a 401.
1011 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1012 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1013 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1014 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1015 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1016 try:
1017 self.report_download_webpage(video_id)
1018 webpage = urllib2.urlopen(request).read()
1019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1021 return
1022
1023 # Extract media URL from playlist XML
1024 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1025 if mobj is None:
1026 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1027 return
1028 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1029 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1030
1031 try:
1032 # Process video information
1033 self._downloader.process_info({
1034 'id': video_id.decode('utf-8'),
1035 'url': video_url,
1036 'uploader': video_uploader,
1037 'upload_date': u'NA',
1038 'title': video_title,
1039 'stitle': simple_title,
1040 'ext': video_extension.decode('utf-8'),
1041 'thumbnail': video_thumbnail.decode('utf-8'),
1042 'description': video_description,
1043 'thumbnail': video_thumbnail,
1044 'player_url': None,
1045 })
1046 except UnavailableVideoError:
1047 self._downloader.trouble(u'\nERROR: unable to download video')
1048
1049
1050 class VimeoIE(InfoExtractor):
1051 """Information extractor for vimeo.com."""
1052
1053 # _VALID_URL matches Vimeo URLs
1054 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1055 IE_NAME = u'vimeo'
1056
1057 def __init__(self, downloader=None):
1058 InfoExtractor.__init__(self, downloader)
1059
1060 def report_download_webpage(self, video_id):
1061 """Report webpage download."""
1062 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1063
1064 def report_extraction(self, video_id):
1065 """Report information extraction."""
1066 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1067
1068 def _real_extract(self, url, new_video=True):
1069 # Extract ID from URL
1070 mobj = re.match(self._VALID_URL, url)
1071 if mobj is None:
1072 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1073 return
1074
1075 # At this point we have a new video
1076 self._downloader.increment_downloads()
1077 video_id = mobj.group(1)
1078
1079 # Retrieve video webpage to extract further information
1080 request = urllib2.Request(url, None, std_headers)
1081 try:
1082 self.report_download_webpage(video_id)
1083 webpage = urllib2.urlopen(request).read()
1084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1085 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1086 return
1087
1088 # Now we begin extracting as much information as we can from what we
1089 # retrieved. First we extract the information common to all extractors,
1090 # and latter we extract those that are Vimeo specific.
1091 self.report_extraction(video_id)
1092
1093 # Extract the config JSON
1094 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1095 try:
1096 config = json.loads(config)
1097 except:
1098 self._downloader.trouble(u'ERROR: unable to extract info section')
1099 return
1100
1101 # Extract title
1102 video_title = config["video"]["title"]
1103 simple_title = simplify_title(video_title)
1104
1105 # Extract uploader
1106 video_uploader = config["video"]["owner"]["name"]
1107
1108 # Extract video thumbnail
1109 video_thumbnail = config["video"]["thumbnail"]
1110
1111 # Extract video description
1112 try:
1113 lxml.etree
1114 except NameError:
1115 video_description = u'No description available.'
1116 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
1117 if mobj is not None:
1118 video_description = mobj.group(1)
1119 else:
1120 html_parser = lxml.etree.HTMLParser()
1121 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
1122 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
1123 # TODO use another parser
1124
1125 # Extract upload date
1126 video_upload_date = u'NA'
1127 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1128 if mobj is not None:
1129 video_upload_date = mobj.group(1)
1130
1131 # Vimeo specific: extract request signature and timestamp
1132 sig = config['request']['signature']
1133 timestamp = config['request']['timestamp']
1134
1135 # Vimeo specific: extract video codec and quality information
1136 # TODO bind to format param
1137 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1138 for codec in codecs:
1139 if codec[0] in config["video"]["files"]:
1140 video_codec = codec[0]
1141 video_extension = codec[1]
1142 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1143 else: quality = 'sd'
1144 break
1145 else:
1146 self._downloader.trouble(u'ERROR: no known codec found')
1147 return
1148
1149 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1150 %(video_id, sig, timestamp, quality, video_codec.upper())
1151
1152 try:
1153 # Process video information
1154 self._downloader.process_info({
1155 'id': video_id,
1156 'url': video_url,
1157 'uploader': video_uploader,
1158 'upload_date': video_upload_date,
1159 'title': video_title,
1160 'stitle': simple_title,
1161 'ext': video_extension,
1162 'thumbnail': video_thumbnail,
1163 'description': video_description,
1164 'player_url': None,
1165 })
1166 except UnavailableVideoError:
1167 self._downloader.trouble(u'ERROR: unable to download video')
1168
1169
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor matches: follows URL-shortener
    redirects, then scrapes the page for a direct media URL (JW Player /
    SWFObject style 'file=' parameters).
    """

    _VALID_URL = r'.*'  # matches anything; must be tried last
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(urllib2.Request):
            # Request subclass that issues HEAD instead of GET, so only
            # headers (and the final URL) are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Keep using HEAD on the redirect target; strip body
                    # headers since a HEAD request carries no body.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Plain urllib2.Request defaults to GET.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.get_origin_req_host(),
                                                        unverifiable=True))

        # Build our opener
        # NOTE: a hand-assembled OpenerDirector (not build_opener) so that
        # only exactly these handlers participate; their order matters.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following -> not a redirect, continue extraction.
        if url == new_url: return False

        # Restart the whole extractor chain with the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id (last URL path component) used only for progress
        # messages; replaced below once the media URL is known.
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # split the basename into (id, extension) - the extension drops
        # its leading dot.
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
1325
1326
1327 class YoutubeSearchIE(InfoExtractor):
1328 """Information Extractor for YouTube search queries."""
1329 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1330 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1331 _youtube_ie = None
1332 _max_youtube_results = 1000
1333 IE_NAME = u'youtube:search'
1334
1335 def __init__(self, youtube_ie, downloader=None):
1336 InfoExtractor.__init__(self, downloader)
1337 self._youtube_ie = youtube_ie
1338
1339 def report_download_page(self, query, pagenum):
1340 """Report attempt to download playlist page with given number."""
1341 query = query.decode(preferredencoding())
1342 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1343
1344 def _real_initialize(self):
1345 self._youtube_ie.initialize()
1346
1347 def _real_extract(self, query):
1348 mobj = re.match(self._VALID_URL, query)
1349 if mobj is None:
1350 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1351 return
1352
1353 prefix, query = query.split(':')
1354 prefix = prefix[8:]
1355 query = query.encode('utf-8')
1356 if prefix == '':
1357 self._download_n_results(query, 1)
1358 return
1359 elif prefix == 'all':
1360 self._download_n_results(query, self._max_youtube_results)
1361 return
1362 else:
1363 try:
1364 n = long(prefix)
1365 if n <= 0:
1366 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1367 return
1368 elif n > self._max_youtube_results:
1369 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1370 n = self._max_youtube_results
1371 self._download_n_results(query, n)
1372 return
1373 except ValueError: # parsing prefix as integer fails
1374 self._download_n_results(query, 1)
1375 return
1376
1377 def _download_n_results(self, query, n):
1378 """Downloads a specified number of results for a query"""
1379
1380 video_ids = []
1381 pagenum = 0
1382 limit = n
1383
1384 while (50 * pagenum) < limit:
1385 self.report_download_page(query, pagenum+1)
1386 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1387 request = urllib2.Request(result_url)
1388 try:
1389 data = urllib2.urlopen(request).read()
1390 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1391 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1392 return
1393 api_response = json.loads(data)['data']
1394
1395 new_ids = list(video['id'] for video in api_response['items'])
1396 video_ids += new_ids
1397
1398 limit = min(n, api_response['totalItems'])
1399 pagenum += 1
1400
1401 if len(video_ids) > n:
1402 video_ids = video_ids[:n]
1403 for id in video_ids:
1404 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1405 return
1406
1407
1408 class GoogleSearchIE(InfoExtractor):
1409 """Information Extractor for Google Video search queries."""
1410 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1411 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1412 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1413 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1414 _google_ie = None
1415 _max_google_results = 1000
1416 IE_NAME = u'video.google:search'
1417
1418 def __init__(self, google_ie, downloader=None):
1419 InfoExtractor.__init__(self, downloader)
1420 self._google_ie = google_ie
1421
1422 def report_download_page(self, query, pagenum):
1423 """Report attempt to download playlist page with given number."""
1424 query = query.decode(preferredencoding())
1425 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1426
1427 def _real_initialize(self):
1428 self._google_ie.initialize()
1429
1430 def _real_extract(self, query):
1431 mobj = re.match(self._VALID_URL, query)
1432 if mobj is None:
1433 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1434 return
1435
1436 prefix, query = query.split(':')
1437 prefix = prefix[8:]
1438 query = query.encode('utf-8')
1439 if prefix == '':
1440 self._download_n_results(query, 1)
1441 return
1442 elif prefix == 'all':
1443 self._download_n_results(query, self._max_google_results)
1444 return
1445 else:
1446 try:
1447 n = long(prefix)
1448 if n <= 0:
1449 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1450 return
1451 elif n > self._max_google_results:
1452 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1453 n = self._max_google_results
1454 self._download_n_results(query, n)
1455 return
1456 except ValueError: # parsing prefix as integer fails
1457 self._download_n_results(query, 1)
1458 return
1459
1460 def _download_n_results(self, query, n):
1461 """Downloads a specified number of results for a query"""
1462
1463 video_ids = []
1464 pagenum = 0
1465
1466 while True:
1467 self.report_download_page(query, pagenum)
1468 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1469 request = urllib2.Request(result_url)
1470 try:
1471 page = urllib2.urlopen(request).read()
1472 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1474 return
1475
1476 # Extract video identifiers
1477 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1478 video_id = mobj.group(1)
1479 if video_id not in video_ids:
1480 video_ids.append(video_id)
1481 if len(video_ids) == n:
1482 # Specified n videos reached
1483 for id in video_ids:
1484 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1485 return
1486
1487 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1488 for id in video_ids:
1489 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1490 return
1491
1492 pagenum = pagenum + 1
1493
1494
1495 class YahooSearchIE(InfoExtractor):
1496 """Information Extractor for Yahoo! Video search queries."""
1497 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1498 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1499 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1500 _MORE_PAGES_INDICATOR = r'\s*Next'
1501 _yahoo_ie = None
1502 _max_yahoo_results = 1000
1503 IE_NAME = u'video.yahoo:search'
1504
1505 def __init__(self, yahoo_ie, downloader=None):
1506 InfoExtractor.__init__(self, downloader)
1507 self._yahoo_ie = yahoo_ie
1508
1509 def report_download_page(self, query, pagenum):
1510 """Report attempt to download playlist page with given number."""
1511 query = query.decode(preferredencoding())
1512 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1513
1514 def _real_initialize(self):
1515 self._yahoo_ie.initialize()
1516
1517 def _real_extract(self, query):
1518 mobj = re.match(self._VALID_URL, query)
1519 if mobj is None:
1520 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1521 return
1522
1523 prefix, query = query.split(':')
1524 prefix = prefix[8:]
1525 query = query.encode('utf-8')
1526 if prefix == '':
1527 self._download_n_results(query, 1)
1528 return
1529 elif prefix == 'all':
1530 self._download_n_results(query, self._max_yahoo_results)
1531 return
1532 else:
1533 try:
1534 n = long(prefix)
1535 if n <= 0:
1536 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537 return
1538 elif n > self._max_yahoo_results:
1539 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1540 n = self._max_yahoo_results
1541 self._download_n_results(query, n)
1542 return
1543 except ValueError: # parsing prefix as integer fails
1544 self._download_n_results(query, 1)
1545 return
1546
1547 def _download_n_results(self, query, n):
1548 """Downloads a specified number of results for a query"""
1549
1550 video_ids = []
1551 already_seen = set()
1552 pagenum = 1
1553
1554 while True:
1555 self.report_download_page(query, pagenum)
1556 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1557 request = urllib2.Request(result_url)
1558 try:
1559 page = urllib2.urlopen(request).read()
1560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1562 return
1563
1564 # Extract video identifiers
1565 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1566 video_id = mobj.group(1)
1567 if video_id not in already_seen:
1568 video_ids.append(video_id)
1569 already_seen.add(video_id)
1570 if len(video_ids) == n:
1571 # Specified n videos reached
1572 for id in video_ids:
1573 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1574 return
1575
1576 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1577 for id in video_ids:
1578 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1579 return
1580
1581 pagenum = pagenum + 1
1582
1583
1584 class YoutubePlaylistIE(InfoExtractor):
1585 """Information Extractor for YouTube playlists."""
1586
1587 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1588 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1589 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
1590 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1591 _youtube_ie = None
1592 IE_NAME = u'youtube:playlist'
1593
1594 def __init__(self, youtube_ie, downloader=None):
1595 InfoExtractor.__init__(self, downloader)
1596 self._youtube_ie = youtube_ie
1597
1598 def report_download_page(self, playlist_id, pagenum):
1599 """Report attempt to download playlist page with given number."""
1600 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1601
1602 def _real_initialize(self):
1603 self._youtube_ie.initialize()
1604
1605 def _real_extract(self, url):
1606 # Extract playlist id
1607 mobj = re.match(self._VALID_URL, url)
1608 if mobj is None:
1609 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1610 return
1611
1612 # Single video case
1613 if mobj.group(3) is not None:
1614 self._youtube_ie.extract(mobj.group(3))
1615 return
1616
1617 # Download playlist pages
1618 # prefix is 'p' as default for playlists but there are other types that need extra care
1619 playlist_prefix = mobj.group(1)
1620 if playlist_prefix == 'a':
1621 playlist_access = 'artist'
1622 else:
1623 playlist_prefix = 'p'
1624 playlist_access = 'view_play_list'
1625 playlist_id = mobj.group(2)
1626 video_ids = []
1627 pagenum = 1
1628
1629 while True:
1630 self.report_download_page(playlist_id, pagenum)
1631 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1632 request = urllib2.Request(url)
1633 try:
1634 page = urllib2.urlopen(request).read()
1635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1637 return
1638
1639 # Extract video identifiers
1640 ids_in_page = []
1641 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1642 if mobj.group(1) not in ids_in_page:
1643 ids_in_page.append(mobj.group(1))
1644 video_ids.extend(ids_in_page)
1645
1646 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1647 break
1648 pagenum = pagenum + 1
1649
1650 playliststart = self._downloader.params.get('playliststart', 1) - 1
1651 playlistend = self._downloader.params.get('playlistend', -1)
1652 if playlistend == -1:
1653 video_ids = video_ids[playliststart:]
1654 else:
1655 video_ids = video_ids[playliststart:playlistend]
1656
1657 for id in video_ids:
1658 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1659 return
1660
1661
1662 class YoutubeUserIE(InfoExtractor):
1663 """Information Extractor for YouTube users."""
1664
1665 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1666 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1667 _GDATA_PAGE_SIZE = 50
1668 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1669 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1670 _youtube_ie = None
1671 IE_NAME = u'youtube:user'
1672
1673 def __init__(self, youtube_ie, downloader=None):
1674 InfoExtractor.__init__(self, downloader)
1675 self._youtube_ie = youtube_ie
1676
1677 def report_download_page(self, username, start_index):
1678 """Report attempt to download user page."""
1679 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1680 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1681
1682 def _real_initialize(self):
1683 self._youtube_ie.initialize()
1684
    def _real_extract(self, url):
        """Collect all video ids for a user via the YouTube Data API and
        hand each one to the wrapped YouTube extractor.

        The API caps results per request (self._GDATA_PAGE_SIZE), so ids
        are fetched page by page until a short page signals the end, then
        sliced according to the playliststart/playlistend options.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData's start-index parameter is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within the page).
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in user options; convert to a 0-based
        # slice index. playlistend is used as an absolute slice end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            # Delegate the actual metadata extraction/download per video.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
1748
1749
1750 class DepositFilesIE(InfoExtractor):
1751 """Information extractor for depositfiles.com"""
1752
1753 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1754 IE_NAME = u'DepositFiles'
1755
1756 def __init__(self, downloader=None):
1757 InfoExtractor.__init__(self, downloader)
1758
1759 def report_download_webpage(self, file_id):
1760 """Report webpage download."""
1761 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1762
1763 def report_extraction(self, file_id):
1764 """Report information extraction."""
1765 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1766
1767 def _real_extract(self, url):
1768 # At this point we have a new file
1769 self._downloader.increment_downloads()
1770
1771 file_id = url.split('/')[-1]
1772 # Rebuild url in english locale
1773 url = 'http://depositfiles.com/en/files/' + file_id
1774
1775 # Retrieve file webpage with 'Free download' button pressed
1776 free_download_indication = { 'gateway_result' : '1' }
1777 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1778 try:
1779 self.report_download_webpage(file_id)
1780 webpage = urllib2.urlopen(request).read()
1781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1783 return
1784
1785 # Search for the real file URL
1786 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1787 if (mobj is None) or (mobj.group(1) is None):
1788 # Try to figure out reason of the error.
1789 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1790 if (mobj is not None) and (mobj.group(1) is not None):
1791 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1792 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1793 else:
1794 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1795 return
1796
1797 file_url = mobj.group(1)
1798 file_extension = os.path.splitext(file_url)[1][1:]
1799
1800 # Search for file title
1801 mobj = re.search(r'<b title="(.*?)">', webpage)
1802 if mobj is None:
1803 self._downloader.trouble(u'ERROR: unable to extract title')
1804 return
1805 file_title = mobj.group(1).decode('utf-8')
1806
1807 try:
1808 # Process file information
1809 self._downloader.process_info({
1810 'id': file_id.decode('utf-8'),
1811 'url': file_url.decode('utf-8'),
1812 'uploader': u'NA',
1813 'upload_date': u'NA',
1814 'title': file_title,
1815 'stitle': file_title,
1816 'ext': file_extension.decode('utf-8'),
1817 'format': u'NA',
1818 'player_url': None,
1819 })
1820 except UnavailableVideoError, err:
1821 self._downloader.trouble(u'ERROR: unable to download file')
1822
1823
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best quality first; format selection slices this list.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with whichever of title/description/owner/thumbnail
        matched, plus a 'video_urls' dict mapping format name -> media URL
        (always present, possibly empty).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped unicode inside the (utf-8) page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from --username/--password or, with --netrc, the
        'facebook' machine entry in ~/.netrc. Without credentials this is
        a no-op. Login failures are warnings, not fatal errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the video page, parse metadata and per-format URLs via
        _parse_page, then process every requested format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # thumbnail image (optional: warn and continue with empty string)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            # NOTE(review): _parse_page never fills 'upload_date', so this
            # branch looks dead — confirm before relying on it.
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # format_limit caps quality: keep only formats at or below it.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2039
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata via the site's JSON skin and build the
        info dict; if the URL already serves the media file directly,
        download from the already-open handle instead."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': simplify_title(title),
                    'ext': ext,
                    # Pass the open response so the body is not fetched twice.
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): json is not imported at the top of this file;
                # presumably it arrives via `from Utils import *` — confirm.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the site's datestamp into YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2132
2133
2134 class MyVideoIE(InfoExtractor):
2135 """Information Extractor for myvideo.de."""
2136
2137 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2138 IE_NAME = u'myvideo'
2139
2140 def __init__(self, downloader=None):
2141 InfoExtractor.__init__(self, downloader)
2142
2143 def report_download_webpage(self, video_id):
2144 """Report webpage download."""
2145 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2146
2147 def report_extraction(self, video_id):
2148 """Report information extraction."""
2149 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2150
2151 def _real_extract(self,url):
2152 mobj = re.match(self._VALID_URL, url)
2153 if mobj is None:
2154 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2155 return
2156
2157 video_id = mobj.group(1)
2158
2159 # Get video webpage
2160 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2161 try:
2162 self.report_download_webpage(video_id)
2163 webpage = urllib2.urlopen(request).read()
2164 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2165 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2166 return
2167
2168 self.report_extraction(video_id)
2169 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2170 webpage)
2171 if mobj is None:
2172 self._downloader.trouble(u'ERROR: unable to extract media URL')
2173 return
2174 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2175
2176 mobj = re.search('<title>([^<]+)</title>', webpage)
2177 if mobj is None:
2178 self._downloader.trouble(u'ERROR: unable to extract title')
2179 return
2180
2181 video_title = mobj.group(1)
2182 video_title = sanitize_title(video_title)
2183
2184 simple_title = simplify_title(video_title)
2185
2186 try:
2187 self._downloader.process_info({
2188 'id': video_id,
2189 'url': video_url,
2190 'uploader': u'NA',
2191 'upload_date': u'NA',
2192 'title': video_title,
2193 'stitle': simple_title,
2194 'ext': u'flv',
2195 'format': u'NA',
2196 'player_url': None,
2197 })
2198 except UnavailableVideoError:
2199 self._downloader.trouble(u'\nERROR: Unable to download video')
2200
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ':tds'-style shortcuts (newest episode) as well as
    # full-episode page URLs of either show.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve an episode page (or show shortcut) to its MRSS index
        and download every media item in it at the highest bitrate."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Map shortcut names onto the shows' full-episodes pages.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty 'episode' group means "newest": the site redirects to
        # the concrete episode URL, handled after the download below.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Follow the redirect and re-parse to learn the episode slug.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash player URL embeds the mgid-style URI of the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Resolve redirects so the stored player URL is canonical.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid has colon-separated parts; the last is the media id,
            # the one before it names the show site.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            # NOTE(review): assumes the feed lists renditions in ascending
            # bitrate order — confirm; bitrates are strings here.
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
2335
2336
2337 class EscapistIE(InfoExtractor):
2338 """Information extractor for The Escapist """
2339
2340 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2341 IE_NAME = u'escapist'
2342
2343 def report_extraction(self, showName):
2344 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2345
2346 def report_config_download(self, showName):
2347 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2348
2349 def _real_extract(self, url):
2350 htmlParser = HTMLParser.HTMLParser()
2351
2352 mobj = re.match(self._VALID_URL, url)
2353 if mobj is None:
2354 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2355 return
2356 showName = mobj.group('showname')
2357 videoId = mobj.group('episode')
2358
2359 self.report_extraction(showName)
2360 try:
2361 webPage = urllib2.urlopen(url).read()
2362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2364 return
2365
2366 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2367 description = htmlParser.unescape(descMatch.group(1))
2368 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2369 imgUrl = htmlParser.unescape(imgMatch.group(1))
2370 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2371 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
2372 configUrlMatch = re.search('config=(.*)$', playerUrl)
2373 configUrl = urllib2.unquote(configUrlMatch.group(1))
2374
2375 self.report_config_download(showName)
2376 try:
2377 configJSON = urllib2.urlopen(configUrl).read()
2378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2380 return
2381
2382 # Technically, it's JavaScript, not JSON
2383 configJSON = configJSON.replace("'", '"')
2384
2385 try:
2386 config = json.loads(configJSON)
2387 except (ValueError,), err:
2388 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2389 return
2390
2391 playlist = config['playlist']
2392 videoUrl = playlist[1]['url']
2393
2394 self._downloader.increment_downloads()
2395 info = {
2396 'id': videoId,
2397 'url': videoUrl,
2398 'uploader': showName,
2399 'upload_date': None,
2400 'title': showName,
2401 'stitle': simplify_title(showName),
2402 'ext': 'flv',
2403 'format': 'flv',
2404 'thumbnail': imgUrl,
2405 'description': description,
2406 'player_url': playerUrl,
2407 }
2408
2409 try:
2410 self._downloader.process_info(info)
2411 except UnavailableVideoError, err:
2412 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
2413
2414
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Map the public video id to the site's internal id, then read
        the moogaloop metadata XML for the real media URL."""
        # NOTE(review): htmlParser is never used in this method.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # The page references the video by an internal numeric id.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_video_id = m.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_video_id,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        try:
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['stitle'] = simplify_title(info['title'])
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # Extension is whatever follows the last dot in the media URL.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2485
2486
2487 class XVideosIE(InfoExtractor):
2488 """Information extractor for xvideos.com"""
2489
2490 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2491 IE_NAME = u'xvideos'
2492
2493 def report_webpage(self, video_id):
2494 """Report information extraction."""
2495 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2496
2497 def report_extraction(self, video_id):
2498 """Report information extraction."""
2499 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2500
2501 def _real_extract(self, url):
2502 htmlParser = HTMLParser.HTMLParser()
2503
2504 mobj = re.match(self._VALID_URL, url)
2505 if mobj is None:
2506 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2507 return
2508 video_id = mobj.group(1).decode('utf-8')
2509
2510 self.report_webpage(video_id)
2511
2512 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2513 try:
2514 webpage = urllib2.urlopen(request).read()
2515 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2516 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2517 return
2518
2519 self.report_extraction(video_id)
2520
2521
2522 # Extract video URL
2523 mobj = re.search(r'flv_url=(.+?)&', webpage)
2524 if mobj is None:
2525 self._downloader.trouble(u'ERROR: unable to extract video url')
2526 return
2527 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2528
2529
2530 # Extract title
2531 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2532 if mobj is None:
2533 self._downloader.trouble(u'ERROR: unable to extract video title')
2534 return
2535 video_title = mobj.group(1).decode('utf-8')
2536
2537
2538 # Extract video thumbnail
2539 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2540 if mobj is None:
2541 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2542 return
2543 video_thumbnail = mobj.group(1).decode('utf-8')
2544
2545
2546
2547 self._downloader.increment_downloads()
2548 info = {
2549 'id': video_id,
2550 'url': video_url,
2551 'uploader': None,
2552 'upload_date': None,
2553 'title': video_title,
2554 'stitle': simplify_title(video_title),
2555 'ext': 'flv',
2556 'format': 'flv',
2557 'thumbnail': video_thumbnail,
2558 'description': None,
2559 'player_url': None,
2560 }
2561
2562 try:
2563 self._downloader.process_info(info)
2564 except UnavailableVideoError, err:
2565 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
2566
2567
2568 class SoundcloudIE(InfoExtractor):
2569 """Information extractor for soundcloud.com
2570 To access the media, the uid of the song and a stream token
2571 must be extracted from the page source and the script must make
2572 a request to media.soundcloud.com/crossdomain.xml. Then
2573 the media can be grabbed by requesting from an url composed
2574 of the stream token and uid
2575 """
2576
2577 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2578 IE_NAME = u'soundcloud'
2579
2580 def __init__(self, downloader=None):
2581 InfoExtractor.__init__(self, downloader)
2582
2583 def report_webpage(self, video_id):
2584 """Report information extraction."""
2585 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2586
2587 def report_extraction(self, video_id):
2588 """Report information extraction."""
2589 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2590
2591 def _real_extract(self, url):
2592 htmlParser = HTMLParser.HTMLParser()
2593
2594 mobj = re.match(self._VALID_URL, url)
2595 if mobj is None:
2596 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597 return
2598
2599 # extract uploader (which is in the url)
2600 uploader = mobj.group(1).decode('utf-8')
2601 # extract simple title (uploader + slug of song title)
2602 slug_title = mobj.group(2).decode('utf-8')
2603 simple_title = uploader + '-' + slug_title
2604
2605 self.report_webpage('%s/%s' % (uploader, slug_title))
2606
2607 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608 try:
2609 webpage = urllib2.urlopen(request).read()
2610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2612 return
2613
2614 self.report_extraction('%s/%s' % (uploader, slug_title))
2615
2616 # extract uid and stream token that soundcloud hands out for access
2617 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618 if mobj:
2619 video_id = mobj.group(1)
2620 stream_token = mobj.group(2)
2621
2622 # extract unsimplified title
2623 mobj = re.search('"title":"(.*?)",', webpage)
2624 if mobj:
2625 title = mobj.group(1)
2626
2627 # construct media url (with uid/token)
2628 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2629 mediaURL = mediaURL % (video_id, stream_token)
2630
2631 # description
2632 description = u'No description available'
2633 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2634 if mobj:
2635 description = mobj.group(1)
2636
2637 # upload date
2638 upload_date = None
2639 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2640 if mobj:
2641 try:
2642 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2643 except Exception, e:
2644 print str(e)
2645
2646 # for soundcloud, a request to a cross domain is required for cookies
2647 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2648
2649 try:
2650 self._downloader.process_info({
2651 'id': video_id.decode('utf-8'),
2652 'url': mediaURL,
2653 'uploader': uploader.decode('utf-8'),
2654 'upload_date': upload_date,
2655 'title': simple_title.decode('utf-8'),
2656 'stitle': simple_title.decode('utf-8'),
2657 'ext': u'mp3',
2658 'format': u'NA',
2659 'player_url': None,
2660 'description': description.decode('utf-8')
2661 })
2662 except UnavailableVideoError:
2663 self._downloader.trouble(u'\nERROR: unable to download video')
2664
2665
2666 class InfoQIE(InfoExtractor):
2667 """Information extractor for infoq.com"""
2668
2669 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2670 IE_NAME = u'infoq'
2671
2672 def report_webpage(self, video_id):
2673 """Report information extraction."""
2674 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2675
2676 def report_extraction(self, video_id):
2677 """Report information extraction."""
2678 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2679
2680 def _real_extract(self, url):
2681 htmlParser = HTMLParser.HTMLParser()
2682
2683 mobj = re.match(self._VALID_URL, url)
2684 if mobj is None:
2685 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2686 return
2687
2688 self.report_webpage(url)
2689
2690 request = urllib2.Request(url)
2691 try:
2692 webpage = urllib2.urlopen(request).read()
2693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2695 return
2696
2697 self.report_extraction(url)
2698
2699
2700 # Extract video URL
2701 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2702 if mobj is None:
2703 self._downloader.trouble(u'ERROR: unable to extract video url')
2704 return
2705 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2706
2707
2708 # Extract title
2709 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2710 if mobj is None:
2711 self._downloader.trouble(u'ERROR: unable to extract video title')
2712 return
2713 video_title = mobj.group(1).decode('utf-8')
2714
2715 # Extract description
2716 video_description = u'No description available.'
2717 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2718 if mobj is not None:
2719 video_description = mobj.group(1).decode('utf-8')
2720
2721 video_filename = video_url.split('/')[-1]
2722 video_id, extension = video_filename.split('.')
2723
2724 self._downloader.increment_downloads()
2725 info = {
2726 'id': video_id,
2727 'url': video_url,
2728 'uploader': None,
2729 'upload_date': None,
2730 'title': video_title,
2731 'stitle': simplify_title(video_title),
2732 'ext': extension,
2733 'format': extension, # Extension is always(?) mp4, but seems to be flv
2734 'thumbnail': None,
2735 'description': video_description,
2736 'player_url': None,
2737 }
2738
2739 try:
2740 self._downloader.process_info(info)
2741 except UnavailableVideoError, err:
2742 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
2743
2744 class MixcloudIE(InfoExtractor):
2745 """Information extractor for www.mixcloud.com"""
2746 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2747 IE_NAME = u'mixcloud'
2748
2749 def __init__(self, downloader=None):
2750 InfoExtractor.__init__(self, downloader)
2751
2752 def report_download_json(self, file_id):
2753 """Report JSON download."""
2754 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2755
2756 def report_extraction(self, file_id):
2757 """Report information extraction."""
2758 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2759
2760 def get_urls(self, jsonData, fmt, bitrate='best'):
2761 """Get urls from 'audio_formats' section in json"""
2762 file_url = None
2763 try:
2764 bitrate_list = jsonData[fmt]
2765 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2766 bitrate = max(bitrate_list) # select highest
2767
2768 url_list = jsonData[fmt][bitrate]
2769 except TypeError: # we have no bitrate info.
2770 url_list = jsonData[fmt]
2771
2772 return url_list
2773
2774 def check_urls(self, url_list):
2775 """Returns 1st active url from list"""
2776 for url in url_list:
2777 try:
2778 urllib2.urlopen(url)
2779 return url
2780 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781 url = None
2782
2783 return None
2784
2785 def _print_formats(self, formats):
2786 print 'Available formats:'
2787 for fmt in formats.keys():
2788 for b in formats[fmt]:
2789 try:
2790 ext = formats[fmt][b][0]
2791 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2792 except TypeError: # we have no bitrate info
2793 ext = formats[fmt][0]
2794 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2795 break
2796
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2799 if mobj is None:
2800 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801 return
2802 # extract uploader & filename from url
2803 uploader = mobj.group(1).decode('utf-8')
2804 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2805
2806 # construct API request
2807 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2808 # retrieve .json file with links to files
2809 request = urllib2.Request(file_url)
2810 try:
2811 self.report_download_json(file_url)
2812 jsonData = urllib2.urlopen(request).read()
2813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2815 return
2816
2817 # parse JSON
2818 json_data = json.loads(jsonData)
2819 player_url = json_data['player_swf_url']
2820 formats = dict(json_data['audio_formats'])
2821
2822 req_format = self._downloader.params.get('format', None)
2823 bitrate = None
2824
2825 if self._downloader.params.get('listformats', None):
2826 self._print_formats(formats)
2827 return
2828
2829 if req_format is None or req_format == 'best':
2830 for format_param in formats.keys():
2831 url_list = self.get_urls(formats, format_param)
2832 # check urls
2833 file_url = self.check_urls(url_list)
2834 if file_url is not None:
2835 break # got it!
2836 else:
2837 if req_format not in formats.keys():
2838 self._downloader.trouble(u'ERROR: format is not available')
2839 return
2840
2841 url_list = self.get_urls(formats, req_format)
2842 file_url = self.check_urls(url_list)
2843 format_param = req_format
2844
2845 # We have audio
2846 self._downloader.increment_downloads()
2847 try:
2848 # Process file information
2849 self._downloader.process_info({
2850 'id': file_id.decode('utf-8'),
2851 'url': file_url.decode('utf-8'),
2852 'uploader': uploader.decode('utf-8'),
2853 'upload_date': u'NA',
2854 'title': json_data['name'],
2855 'stitle': simplify_title(json_data['name']),
2856 'ext': file_url.split('.')[-1].decode('utf-8'),
2857 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2858 'thumbnail': json_data['thumbnail_url'],
2859 'description': json_data['description'],
2860 'player_url': player_url.decode('utf-8'),
2861 })
2862 except UnavailableVideoError, err:
2863 self._downloader.trouble(u'ERROR: unable to download file')
2864
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Dispatches on the URL shape:
      * course + video given  -> extract and download that single video
      * course only           -> playlist of all VideoPage links on the course page
      * neither (root/home)   -> playlist of all CoursePage links on the home page
    The playlist branches recurse by calling self.extract() on each reference.
    """

    # NOTE(review): the dots in the hostname are unescaped, so the pattern is
    # slightly looser than intended (matches any character there) — confirm
    # before tightening.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the root playlist
        depending on which named groups matched in _VALID_URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': simplify_title(course + '_' + video),
            }

            self.report_extraction(info['id'])
            # Video metadata lives in an XML file next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall()[0] raises IndexError when the element is missing.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['stitle'] = simplify_title(info['title'])
            # Derive extension (and format) from the media URL's suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            info = {
                'id': simplify_title(course),
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title is optional; fall back to the simplified id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every video page linked from the course page, de-duplicated
            # in first-seen order, and recurse into each one.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        else: # Root page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            # Collect every course page linked from the home page and recurse.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
2983
2984 class MTVIE(InfoExtractor):
2985 """Information extractor for MTV.com"""
2986
2987 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2988 IE_NAME = u'mtv'
2989
2990 def report_webpage(self, video_id):
2991 """Report information extraction."""
2992 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2993
2994 def report_extraction(self, video_id):
2995 """Report information extraction."""
2996 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2997
2998 def _real_extract(self, url):
2999 mobj = re.match(self._VALID_URL, url)
3000 if mobj is None:
3001 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3002 return
3003 if not mobj.group('proto'):
3004 url = 'http://' + url
3005 video_id = mobj.group('videoid')
3006 self.report_webpage(video_id)
3007
3008 request = urllib2.Request(url)
3009 try:
3010 webpage = urllib2.urlopen(request).read()
3011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3012 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3013 return
3014
3015 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3016 if mobj is None:
3017 self._downloader.trouble(u'ERROR: unable to extract song name')
3018 return
3019 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3020 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3021 if mobj is None:
3022 self._downloader.trouble(u'ERROR: unable to extract performer')
3023 return
3024 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3025 video_title = performer + ' - ' + song_name
3026
3027 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3028 if mobj is None:
3029 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3030 return
3031 mtvn_uri = mobj.group(1)
3032
3033 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3034 if mobj is None:
3035 self._downloader.trouble(u'ERROR: unable to extract content id')
3036 return
3037 content_id = mobj.group(1)
3038
3039 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3040 self.report_extraction(video_id)
3041 request = urllib2.Request(videogen_url)
3042 try:
3043 metadataXml = urllib2.urlopen(request).read()
3044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3045 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3046 return
3047
3048 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3049 renditions = mdoc.findall('.//rendition')
3050
3051 # For now, always pick the highest quality.
3052 rendition = renditions[-1]
3053
3054 try:
3055 _,_,ext = rendition.attrib['type'].partition('/')
3056 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3057 video_url = rendition.find('./src').text
3058 except KeyError:
3059 self._downloader.trouble('Invalid rendition field.')
3060 return
3061
3062 self._downloader.increment_downloads()
3063 info = {
3064 'id': video_id,
3065 'url': video_url,
3066 'uploader': performer,
3067 'title': video_title,
3068 'stitle': simplify_title(video_title),
3069 'ext': ext,
3070 'format': format,
3071 }
3072
3073 try:
3074 self._downloader.process_info(info)
3075 except UnavailableVideoError, err:
3076 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)