]> jfr.im git - yt-dlp.git/blob - youtube-dl
Release 2012.01.08b
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# People who have contributed code to this program, in no particular order.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Release identifier of this script (matches the release date).
__version__ = '2012.01.08b'

# Location of the current version of this script on the project repository;
# presumably fetched by a self-update routine (not visible in this chunk).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers forced onto every outgoing request by
# YoutubeDLHandler.http_request().  The User-Agent mimics a desktop
# Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    # Minimal stand-in for the stdlib json module: a pure-Python
    # recursive-descent JSON parser exposing only json.loads().
    class json(object):
        @staticmethod
        def loads(s):
            """Parse a UTF-8 encoded JSON byte string and return the value."""
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # i is the index in s where parsing failed.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; with expectMore, fail on premature end.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Decode one backslash escape sequence inside a JSON string.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        # Plain \uXXXX escape.
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # UTF-16 surrogate pair: \uD8xx followed by \uDCxx.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                # i points at the opening quote; returns (next_index, value).
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # Count the backslashes immediately before the quote:
                    # an odd number means the quote itself is escaped.
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                # i points at '{'; returns (next_index, dict).
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                # i points at '['; returns (next_index, list).
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Parse the literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                # Integers become int, anything with . / e / E becomes float.
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; numbers are the default.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original routed this through a single-shot generator and
    # .next(), which added nothing; a plain return is equivalent.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; broken locales
        # can report a codec Python does not know.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare except: (which also swallowed
        # KeyboardInterrupt/SystemExit); fall back to UTF-8.
        pref = 'UTF-8'
    return pref
211
212
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (&#160;) or hex (&#xA0;).
    # The old pattern used x?\d+, so hex references containing the
    # digits a-f never matched and fell through as literals.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output.  On Windows, stdout must be put
            # into binary mode so the video data is not mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz yields None for unparsable input, in which case None
    # is propagated to the caller.
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
def _unescapeHTML(s):
    """
    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    # Delegate entity decoding to the stdlib HTML parser.
    return HTMLParser.HTMLParser().unescape(s)
301
302 def _encodeFilename(s):
303 """
304 @param s The name of the file (of type unicode)
305 """
306
307 assert type(s) == type(u'')
308 return s.encode(sys.getfilesystemencoding(), 'ignore')
309
class DownloadError(Exception):
    """Raised when a download fails and errors are not being ignored.

    FileDownloader objects throw this exception when they are not
    configured to continue on errors; it carries the appropriate error
    message.
    """
    pass
318
319
class SameFileError(Exception):
    """Raised when several downloads would collide on one output file.

    FileDownloader objects throw this when they detect that multiple
    files would have to be written to the same path on disk.
    """
    pass
327
328
class PostProcessingError(Exception):
    """Raised by a PostProcessor's .run() method.

    Indicates that an error occurred during the postprocessing task.
    """
    pass
336
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
340
341
class UnavailableVideoError(Exception):
    """Raised when a video is requested in an unavailable format.

    Thrown when the requested format does not exist for the video.
    """
    pass
349
350
class ContentTooShortError(Exception):
    """Raised when a download delivers fewer bytes than announced.

    FileDownloader objects may raise this when the file they downloaded
    is smaller than the size the server announced, which indicates the
    connection was probably interrupted.
    """
    # Byte counts: what was actually received vs. what the server promised.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
365
366
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Decompress "deflate" content: first assume a raw stream (no
        # zlib header); if that fails, retry with the standard wrapper.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl carrying the HTTP status code even on
        # Python versions whose addinfourl has no code argument.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the std_headers values, replacing any headers of the
        # same name the caller may have set.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header is internal only: strip it (and the
        # Accept-encoding it disables) before the request goes out.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip: wrap the raw body in a GzipFile so callers read plain data.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress eagerly and serve from a StringIO.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
424
425
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    """

    # Class-level placeholders; __init__ replaces them with per-instance
    # values.
    params = None               # Dictionary of options (see docstring above)
    _ies = []                   # Registered InfoExtractors, in order
    _pps = []                   # Registered PostProcessors, in order
    _download_retcode = None    # Return code (0 = OK, 1 = an error occurred)
    _num_downloads = None       # Ordinal of the current download
    _screen_file = None         # Stream used by to_screen (stdout or stderr)
490
491 def __init__(self, params):
492 """Create a FileDownloader object with the given options."""
493 self._ies = []
494 self._pps = []
495 self._download_retcode = 0
496 self._num_downloads = 0
497 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
498 self.params = params
499
500 @staticmethod
501 def format_bytes(bytes):
502 if bytes is None:
503 return 'N/A'
504 if type(bytes) is str:
505 bytes = float(bytes)
506 if bytes == 0.0:
507 exponent = 0
508 else:
509 exponent = long(math.log(bytes, 1024.0))
510 suffix = 'bkMGTPEZY'[exponent]
511 converted = float(bytes) / float(1024 ** exponent)
512 return '%.2f%s' % (converted, suffix)
513
514 @staticmethod
515 def calc_percent(byte_counter, data_len):
516 if data_len is None:
517 return '---.-%'
518 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
519
520 @staticmethod
521 def calc_eta(start, now, total, current):
522 if total is None:
523 return '--:--'
524 dif = now - start
525 if current == 0 or dif < 0.001: # One millisecond
526 return '--:--'
527 rate = float(current) / dif
528 eta = long((float(total) - float(current)) / rate)
529 (eta_mins, eta_secs) = divmod(eta, 60)
530 if eta_mins > 99:
531 return '--:--'
532 return '%02d:%02d' % (eta_mins, eta_secs)
533
534 @staticmethod
535 def calc_speed(start, now, bytes):
536 dif = now - start
537 if bytes == 0 or dif < 0.001: # One millisecond
538 return '%10s' % '---b/s'
539 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
540
541 @staticmethod
542 def best_block_size(elapsed_time, bytes):
543 new_min = max(bytes / 2.0, 1.0)
544 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
545 if elapsed_time < 0.001:
546 return long(new_max)
547 rate = bytes / elapsed_time
548 if rate > new_max:
549 return long(new_max)
550 if rate < new_min:
551 return long(new_min)
552 return long(rate)
553
554 @staticmethod
555 def parse_bytes(bytestr):
556 """Parse a string indicating a byte quantity into a long integer."""
557 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
558 if matchobj is None:
559 return None
560 number = float(matchobj.group(1))
561 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
562 return long(round(number * multiplier))
563
564 def add_info_extractor(self, ie):
565 """Add an InfoExtractor object to the end of the list."""
566 self._ies.append(ie)
567 ie.set_downloader(self)
568
569 def add_post_processor(self, pp):
570 """Add a PostProcessor object to the end of the chain."""
571 self._pps.append(pp)
572 pp.set_downloader(self)
573
574 def to_screen(self, message, skip_eol=False):
575 """Print message to stdout if not in quiet mode."""
576 assert type(message) == type(u'')
577 if not self.params.get('quiet', False):
578 terminator = [u'\n', u''][skip_eol]
579 output = message + terminator
580
581 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
582 output = output.encode(preferredencoding(), 'ignore')
583 self._screen_file.write(output)
584 self._screen_file.flush()
585
586 def to_stderr(self, message):
587 """Print message to stderr."""
588 print >>sys.stderr, message.encode(preferredencoding())
589
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # Xterm-compatible terminals: OSC 0 escape sequence sets the title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
600
601 def fixed_template(self):
602 """Checks if the output template is fixed."""
603 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
604
605 def trouble(self, message=None):
606 """Determine action to take when a download problem appears.
607
608 Depending on if the downloader has been configured to ignore
609 download errors or not, this method may throw an exception or
610 not when errors are found, after printing the message.
611 """
612 if message is not None:
613 self.to_stderr(message)
614 if not self.params.get('ignoreerrors', False):
615 raise DownloadError(message)
616 self._download_retcode = 1
617
618 def slow_down(self, start_time, byte_counter):
619 """Sleep if the download speed is over the rate limit."""
620 rate_limit = self.params.get('ratelimit', None)
621 if rate_limit is None or byte_counter == 0:
622 return
623 now = time.time()
624 elapsed = now - start_time
625 if elapsed <= 0.0:
626 return
627 speed = float(byte_counter) / elapsed
628 if speed > rate_limit:
629 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
630
631 def temp_name(self, filename):
632 """Returns a temporary filename for the given filename."""
633 if self.params.get('nopart', False) or filename == u'-' or \
634 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
635 return filename
636 return filename + u'.part'
637
638 def undo_temp_name(self, filename):
639 if filename.endswith(u'.part'):
640 return filename[:-len(u'.part')]
641 return filename
642
643 def try_rename(self, old_filename, new_filename):
644 try:
645 if old_filename == new_filename:
646 return
647 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
648 except (IOError, OSError), err:
649 self.trouble(u'ERROR: unable to rename file')
650
651 def try_utime(self, filename, last_modified_hdr):
652 """Try to set the last-modified time of the given file."""
653 if last_modified_hdr is None:
654 return
655 if not os.path.isfile(_encodeFilename(filename)):
656 return
657 timestr = last_modified_hdr
658 if timestr is None:
659 return
660 filetime = timeconvert(timestr)
661 if filetime is None:
662 return filetime
663 try:
664 os.utime(filename, (time.time(), filetime))
665 except:
666 pass
667 return filetime
668
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # The leading '\r' rewrites the current terminal line in place,
        # and skip_eol keeps the cursor on it for the next update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename may not be representable in the console
            # encoding; fall back to a message without it.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is still on screen; just terminate it.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
719
720 def prepare_filename(self, info_dict):
721 """Generate the output filename."""
722 try:
723 template_dict = dict(info_dict)
724 template_dict['epoch'] = unicode(long(time.time()))
725 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
726 filename = self.params['outtmpl'] % template_dict
727 return filename
728 except (ValueError, KeyError), err:
729 self.trouble(u'ERROR: invalid system charset or erroneous output template')
730 return None
731
732 def _match_entry(self, info_dict):
733 """ Returns None iff the file should be downloaded """
734
735 title = info_dict['title']
736 matchtitle = self.params.get('matchtitle', False)
737 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
738 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
739 rejecttitle = self.params.get('rejecttitle', False)
740 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
741 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
742 return None
743
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""

        # Title match/reject filters may veto the download entirely.
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            return

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        # prepare_filename() returns None on template errors.
        if filename is None:
            return

        try:
            dn = os.path.dirname(_encodeFilename(filename))
            if dn != '' and not os.path.exists(dn): # dn is already encoded
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + u'.description'
                self.report_writedescription(descfn)
                descfile = open(_encodeFilename(descfn), 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for a usable JSON encoder; the trivialjson
                # fallback above only provides loads(), not dump().
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(_encodeFilename(infofn), 'wb')
                try:
                    # 'urlhandle' is a live urllib2 response object and
                    # cannot be serialized, so it is filtered out.
                    json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
                # The file already exists and must not be overwritten;
                # treat it as a successful download.
                success = True
            else:
                try:
                    success = self._do_download(filename, info_dict)
                except (OSError, IOError), err:
                    raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                    return
                except (ContentTooShortError, ), err:
                    self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                    return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return
841
842 def download(self, url_list):
843 """Download a given list of URLs."""
844 if len(url_list) > 1 and self.fixed_template():
845 raise SameFileError(self.params['outtmpl'])
846
847 for url in url_list:
848 suitable_found = False
849 for ie in self._ies:
850 # Go to next InfoExtractor if not suitable
851 if not ie.suitable(url):
852 continue
853
854 # Suitable InfoExtractor found
855 suitable_found = True
856
857 # Extract information from URL and process it
858 ie.extract(url)
859
860 # Suitable InfoExtractor had been found; go to next URL
861 break
862
863 if not suitable_found:
864 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
865
866 return self._download_retcode
867
868 def post_process(self, filename, ie_info):
869 """Run the postprocessing chain on the given file."""
870 info = dict(ie_info)
871 info['filepath'] = filename
872 for pp in self._pps:
873 info = pp.run(info)
874 if info is None:
875 break
876
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// stream by driving the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Retry with resume (-e); after exit code 1, also pass -k 1.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # No progress was made; stop retrying.
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
913
914 def _do_download(self, filename, info_dict):
915 url = info_dict['url']
916 player_url = info_dict.get('player_url', None)
917
918 # Check file already present
919 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
920 self.report_file_already_downloaded(filename)
921 return True
922
923 # Attempt to download using rtmpdump
924 if url.startswith('rtmp'):
925 return self._download_with_rtmpdump(filename, url, player_url)
926
927 tmpfilename = self.temp_name(filename)
928 stream = None
929
930 # Do not include the Accept-Encoding header
931 headers = {'Youtubedl-no-compression': 'True'}
932 basic_request = urllib2.Request(url, None, headers)
933 request = urllib2.Request(url, None, headers)
934
935 # Establish possible resume length
936 if os.path.isfile(_encodeFilename(tmpfilename)):
937 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
938 else:
939 resume_len = 0
940
941 open_mode = 'wb'
942 if resume_len != 0:
943 if self.params.get('continuedl', False):
944 self.report_resuming_byte(resume_len)
945 request.add_header('Range','bytes=%d-' % resume_len)
946 open_mode = 'ab'
947 else:
948 resume_len = 0
949
950 count = 0
951 retries = self.params.get('retries', 0)
952 while count <= retries:
953 # Establish connection
954 try:
955 if count == 0 and 'urlhandle' in info_dict:
956 data = info_dict['urlhandle']
957 data = urllib2.urlopen(request)
958 break
959 except (urllib2.HTTPError, ), err:
960 if (err.code < 500 or err.code >= 600) and err.code != 416:
961 # Unexpected HTTP error
962 raise
963 elif err.code == 416:
964 # Unable to resume (requested range not satisfiable)
965 try:
966 # Open the connection again without the range header
967 data = urllib2.urlopen(basic_request)
968 content_length = data.info()['Content-Length']
969 except (urllib2.HTTPError, ), err:
970 if err.code < 500 or err.code >= 600:
971 raise
972 else:
973 # Examine the reported length
974 if (content_length is not None and
975 (resume_len - 100 < long(content_length) < resume_len + 100)):
976 # The file had already been fully downloaded.
977 # Explanation to the above condition: in issue #175 it was revealed that
978 # YouTube sometimes adds or removes a few bytes from the end of the file,
979 # changing the file size slightly and causing problems for some users. So
980 # I decided to implement a suggested change and consider the file
981 # completely downloaded if the file size differs less than 100 bytes from
982 # the one in the hard drive.
983 self.report_file_already_downloaded(filename)
984 self.try_rename(tmpfilename, filename)
985 return True
986 else:
987 # The length does not match, we start the download over
988 self.report_unable_to_resume()
989 open_mode = 'wb'
990 break
991 # Retry
992 count += 1
993 if count <= retries:
994 self.report_retry(count, retries)
995
996 if count > retries:
997 self.trouble(u'ERROR: giving up after %s retries' % retries)
998 return False
999
1000 data_len = data.info().get('Content-length', None)
1001 if data_len is not None:
1002 data_len = long(data_len) + resume_len
1003 data_len_str = self.format_bytes(data_len)
1004 byte_counter = 0 + resume_len
1005 block_size = 1024
1006 start = time.time()
1007 while True:
1008 # Download and write
1009 before = time.time()
1010 data_block = data.read(block_size)
1011 after = time.time()
1012 if len(data_block) == 0:
1013 break
1014 byte_counter += len(data_block)
1015
1016 # Open file just in time
1017 if stream is None:
1018 try:
1019 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1020 assert stream is not None
1021 filename = self.undo_temp_name(tmpfilename)
1022 self.report_destination(filename)
1023 except (OSError, IOError), err:
1024 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1025 return False
1026 try:
1027 stream.write(data_block)
1028 except (IOError, OSError), err:
1029 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1030 return False
1031 block_size = self.best_block_size(after - before, len(data_block))
1032
1033 # Progress message
1034 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1035 if data_len is None:
1036 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1037 else:
1038 percent_str = self.calc_percent(byte_counter, data_len)
1039 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1040 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1041
1042 # Apply rate limit
1043 self.slow_down(start, byte_counter - resume_len)
1044
1045 if stream is None:
1046 self.trouble(u'\nERROR: Did not get any data blocks')
1047 return False
1048 stream.close()
1049 self.report_finish()
1050 if data_len is not None and byte_counter != data_len:
1051 raise ContentTooShortError(byte_counter, long(data_len))
1052 self.try_rename(tmpfilename, filename)
1053
1054 # Update file modification time
1055 if self.params.get('updatetime', True):
1056 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1057
1058 return True
1059
1060
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor pulls out the metadata the
	FileDownloader needs: the real video URL, the literal and simplified
	titles, the uploader, and so on. The metadata travels as a dictionary
	which the FileDownloader then processes (possibly downloading the
	video, among other outcomes). Every dictionary must carry these keys:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following keys are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	define a _VALID_URL regexp, and usually be registered in the list
	of extractors.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Remember the downloader and start out uninitialized."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if necessary, then perform the real extraction."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Actual initialization; subclasses override this."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses override this."""
		pass
1129
1130
1131 class YoutubeIE(InfoExtractor):
1132 """Information extractor for youtube.com."""
1133
1134 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1135 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1136 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1137 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1138 _NETRC_MACHINE = 'youtube'
1139 # Listed in order of quality
1140 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1141 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1142 _video_extensions = {
1143 '13': '3gp',
1144 '17': 'mp4',
1145 '18': 'mp4',
1146 '22': 'mp4',
1147 '37': 'mp4',
1148 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1149 '43': 'webm',
1150 '44': 'webm',
1151 '45': 'webm',
1152 }
1153 _video_dimensions = {
1154 '5': '240x400',
1155 '6': '???',
1156 '13': '???',
1157 '17': '144x176',
1158 '18': '360x640',
1159 '22': '720x1280',
1160 '34': '360x640',
1161 '35': '480x854',
1162 '37': '1080x1920',
1163 '38': '3072x4096',
1164 '43': '360x640',
1165 '44': '480x854',
1166 '45': '720x1280',
1167 }
1168 IE_NAME = u'youtube'
1169
1170 def report_lang(self):
1171 """Report attempt to set language."""
1172 self._downloader.to_screen(u'[youtube] Setting language')
1173
1174 def report_login(self):
1175 """Report attempt to log in."""
1176 self._downloader.to_screen(u'[youtube] Logging in')
1177
1178 def report_age_confirmation(self):
1179 """Report attempt to confirm age."""
1180 self._downloader.to_screen(u'[youtube] Confirming age')
1181
1182 def report_video_webpage_download(self, video_id):
1183 """Report attempt to download video webpage."""
1184 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1185
1186 def report_video_info_webpage_download(self, video_id):
1187 """Report attempt to download video info webpage."""
1188 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1189
1190 def report_information_extraction(self, video_id):
1191 """Report attempt to extract video information."""
1192 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1193
1194 def report_unavailable_format(self, video_id, format):
1195 """Report extracted video URL."""
1196 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1197
1198 def report_rtmp_download(self):
1199 """Indicate the download will use the RTMP protocol."""
1200 self._downloader.to_screen(u'[youtube] RTMP download detected')
1201
1202 def _print_formats(self, formats):
1203 print 'Available formats:'
1204 for x in formats:
1205 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1206
1207 def _real_initialize(self):
1208 if self._downloader is None:
1209 return
1210
1211 username = None
1212 password = None
1213 downloader_params = self._downloader.params
1214
1215 # Attempt to use provided username and password or .netrc data
1216 if downloader_params.get('username', None) is not None:
1217 username = downloader_params['username']
1218 password = downloader_params['password']
1219 elif downloader_params.get('usenetrc', False):
1220 try:
1221 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1222 if info is not None:
1223 username = info[0]
1224 password = info[2]
1225 else:
1226 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1227 except (IOError, netrc.NetrcParseError), err:
1228 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1229 return
1230
1231 # Set language
1232 request = urllib2.Request(self._LANG_URL)
1233 try:
1234 self.report_lang()
1235 urllib2.urlopen(request).read()
1236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1237 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1238 return
1239
1240 # No authentication to be performed
1241 if username is None:
1242 return
1243
1244 # Log in
1245 login_form = {
1246 'current_form': 'loginForm',
1247 'next': '/',
1248 'action_login': 'Log In',
1249 'username': username,
1250 'password': password,
1251 }
1252 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1253 try:
1254 self.report_login()
1255 login_results = urllib2.urlopen(request).read()
1256 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1257 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1258 return
1259 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1260 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1261 return
1262
1263 # Confirm age
1264 age_form = {
1265 'next_url': '/',
1266 'action_confirm': 'Confirm',
1267 }
1268 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1269 try:
1270 self.report_age_confirmation()
1271 age_results = urllib2.urlopen(request).read()
1272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1274 return
1275
1276 def _real_extract(self, url):
1277 # Extract video id from URL
1278 mobj = re.match(self._VALID_URL, url)
1279 if mobj is None:
1280 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1281 return
1282 video_id = mobj.group(2)
1283
1284 # Get video webpage
1285 self.report_video_webpage_download(video_id)
1286 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1287 try:
1288 video_webpage = urllib2.urlopen(request).read()
1289 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1290 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1291 return
1292
1293 # Attempt to extract SWF player URL
1294 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1295 if mobj is not None:
1296 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1297 else:
1298 player_url = None
1299
1300 # Get video info
1301 self.report_video_info_webpage_download(video_id)
1302 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1303 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1304 % (video_id, el_type))
1305 request = urllib2.Request(video_info_url)
1306 try:
1307 video_info_webpage = urllib2.urlopen(request).read()
1308 video_info = parse_qs(video_info_webpage)
1309 if 'token' in video_info:
1310 break
1311 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1313 return
1314 if 'token' not in video_info:
1315 if 'reason' in video_info:
1316 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1317 else:
1318 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1319 return
1320
1321 # Start extracting information
1322 self.report_information_extraction(video_id)
1323
1324 # uploader
1325 if 'author' not in video_info:
1326 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1327 return
1328 video_uploader = urllib.unquote_plus(video_info['author'][0])
1329
1330 # title
1331 if 'title' not in video_info:
1332 self._downloader.trouble(u'ERROR: unable to extract video title')
1333 return
1334 video_title = urllib.unquote_plus(video_info['title'][0])
1335 video_title = video_title.decode('utf-8')
1336 video_title = sanitize_title(video_title)
1337
1338 # simplified title
1339 simple_title = _simplify_title(video_title)
1340
1341 # thumbnail image
1342 if 'thumbnail_url' not in video_info:
1343 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1344 video_thumbnail = ''
1345 else: # don't panic if we can't find it
1346 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1347
1348 # upload date
1349 upload_date = u'NA'
1350 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1351 if mobj is not None:
1352 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1353 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1354 for expression in format_expressions:
1355 try:
1356 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1357 except:
1358 pass
1359
1360 # description
1361 try:
1362 lxml.etree
1363 except NameError:
1364 video_description = u'No description available.'
1365 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1366 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1367 if mobj is not None:
1368 video_description = mobj.group(1).decode('utf-8')
1369 else:
1370 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1371 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1372 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1373 # TODO use another parser
1374
1375 # token
1376 video_token = urllib.unquote_plus(video_info['token'][0])
1377
1378 # Decide which formats to download
1379 req_format = self._downloader.params.get('format', None)
1380
1381 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1382 self.report_rtmp_download()
1383 video_url_list = [(None, video_info['conn'][0])]
1384 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1385 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1386 url_data = [parse_qs(uds) for uds in url_data_strs]
1387 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1388 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1389
1390 format_limit = self._downloader.params.get('format_limit', None)
1391 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1392 if format_limit is not None and format_limit in available_formats:
1393 format_list = available_formats[available_formats.index(format_limit):]
1394 else:
1395 format_list = available_formats
1396 existing_formats = [x for x in format_list if x in url_map]
1397 if len(existing_formats) == 0:
1398 self._downloader.trouble(u'ERROR: no known formats available for video')
1399 return
1400 if self._downloader.params.get('listformats', None):
1401 self._print_formats(existing_formats)
1402 return
1403 if req_format is None or req_format == 'best':
1404 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1405 elif req_format == 'worst':
1406 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1407 elif req_format in ('-1', 'all'):
1408 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1409 else:
1410 # Specific formats. We pick the first in a slash-delimeted sequence.
1411 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1412 req_formats = req_format.split('/')
1413 video_url_list = None
1414 for rf in req_formats:
1415 if rf in url_map:
1416 video_url_list = [(rf, url_map[rf])]
1417 break
1418 if video_url_list is None:
1419 self._downloader.trouble(u'ERROR: requested format not available')
1420 return
1421 else:
1422 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1423 return
1424
1425 for format_param, video_real_url in video_url_list:
1426 # At this point we have a new video
1427 self._downloader.increment_downloads()
1428
1429 # Extension
1430 video_extension = self._video_extensions.get(format_param, 'flv')
1431
1432 try:
1433 # Process video information
1434 self._downloader.process_info({
1435 'id': video_id.decode('utf-8'),
1436 'url': video_real_url.decode('utf-8'),
1437 'uploader': video_uploader.decode('utf-8'),
1438 'upload_date': upload_date,
1439 'title': video_title,
1440 'stitle': simple_title,
1441 'ext': video_extension.decode('utf-8'),
1442 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1443 'thumbnail': video_thumbnail.decode('utf-8'),
1444 'description': video_description,
1445 'player_url': player_url,
1446 })
1447 except UnavailableVideoError, err:
1448 self._downloader.trouble(u'\nERROR: unable to download video')
1449
1450
1451 class MetacafeIE(InfoExtractor):
1452 """Information Extractor for metacafe.com."""
1453
1454 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1455 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1456 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1457 _youtube_ie = None
1458 IE_NAME = u'metacafe'
1459
1460 def __init__(self, youtube_ie, downloader=None):
1461 InfoExtractor.__init__(self, downloader)
1462 self._youtube_ie = youtube_ie
1463
1464 def report_disclaimer(self):
1465 """Report disclaimer retrieval."""
1466 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1467
1468 def report_age_confirmation(self):
1469 """Report attempt to confirm age."""
1470 self._downloader.to_screen(u'[metacafe] Confirming age')
1471
1472 def report_download_webpage(self, video_id):
1473 """Report webpage download."""
1474 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1475
1476 def report_extraction(self, video_id):
1477 """Report information extraction."""
1478 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1479
1480 def _real_initialize(self):
1481 # Retrieve disclaimer
1482 request = urllib2.Request(self._DISCLAIMER)
1483 try:
1484 self.report_disclaimer()
1485 disclaimer = urllib2.urlopen(request).read()
1486 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1487 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1488 return
1489
1490 # Confirm age
1491 disclaimer_form = {
1492 'filters': '0',
1493 'submit': "Continue - I'm over 18",
1494 }
1495 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1496 try:
1497 self.report_age_confirmation()
1498 disclaimer = urllib2.urlopen(request).read()
1499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1501 return
1502
1503 def _real_extract(self, url):
1504 # Extract id and simplified title from URL
1505 mobj = re.match(self._VALID_URL, url)
1506 if mobj is None:
1507 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1508 return
1509
1510 video_id = mobj.group(1)
1511
1512 # Check if video comes from YouTube
1513 mobj2 = re.match(r'^yt-(.*)$', video_id)
1514 if mobj2 is not None:
1515 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1516 return
1517
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1520
1521 simple_title = mobj.group(2).decode('utf-8')
1522
1523 # Retrieve video webpage to extract further information
1524 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1525 try:
1526 self.report_download_webpage(video_id)
1527 webpage = urllib2.urlopen(request).read()
1528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1529 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1530 return
1531
1532 # Extract URL, uploader and title from webpage
1533 self.report_extraction(video_id)
1534 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1535 if mobj is not None:
1536 mediaURL = urllib.unquote(mobj.group(1))
1537 video_extension = mediaURL[-3:]
1538
1539 # Extract gdaKey if available
1540 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1541 if mobj is None:
1542 video_url = mediaURL
1543 else:
1544 gdaKey = mobj.group(1)
1545 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1546 else:
1547 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1548 if mobj is None:
1549 self._downloader.trouble(u'ERROR: unable to extract media URL')
1550 return
1551 vardict = parse_qs(mobj.group(1))
1552 if 'mediaData' not in vardict:
1553 self._downloader.trouble(u'ERROR: unable to extract media URL')
1554 return
1555 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1556 if mobj is None:
1557 self._downloader.trouble(u'ERROR: unable to extract media URL')
1558 return
1559 mediaURL = mobj.group(1).replace('\\/', '/')
1560 video_extension = mediaURL[-3:]
1561 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1562
1563 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1564 if mobj is None:
1565 self._downloader.trouble(u'ERROR: unable to extract title')
1566 return
1567 video_title = mobj.group(1).decode('utf-8')
1568 video_title = sanitize_title(video_title)
1569
1570 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1571 if mobj is None:
1572 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1573 return
1574 video_uploader = mobj.group(1)
1575
1576 try:
1577 # Process video information
1578 self._downloader.process_info({
1579 'id': video_id.decode('utf-8'),
1580 'url': video_url.decode('utf-8'),
1581 'uploader': video_uploader.decode('utf-8'),
1582 'upload_date': u'NA',
1583 'title': video_title,
1584 'stitle': simple_title,
1585 'ext': video_extension.decode('utf-8'),
1586 'format': u'NA',
1587 'player_url': None,
1588 })
1589 except UnavailableVideoError:
1590 self._downloader.trouble(u'\nERROR: unable to download video')
1591
1592
1593 class DailymotionIE(InfoExtractor):
1594 """Information Extractor for Dailymotion"""
1595
1596 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1597 IE_NAME = u'dailymotion'
1598
1599 def __init__(self, downloader=None):
1600 InfoExtractor.__init__(self, downloader)
1601
1602 def report_download_webpage(self, video_id):
1603 """Report webpage download."""
1604 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1605
1606 def report_extraction(self, video_id):
1607 """Report information extraction."""
1608 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1609
1610 def _real_extract(self, url):
1611 # Extract id and simplified title from URL
1612 mobj = re.match(self._VALID_URL, url)
1613 if mobj is None:
1614 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1615 return
1616
1617 # At this point we have a new video
1618 self._downloader.increment_downloads()
1619 video_id = mobj.group(1)
1620
1621 video_extension = 'flv'
1622
1623 # Retrieve video webpage to extract further information
1624 request = urllib2.Request(url)
1625 request.add_header('Cookie', 'family_filter=off')
1626 try:
1627 self.report_download_webpage(video_id)
1628 webpage = urllib2.urlopen(request).read()
1629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1630 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1631 return
1632
1633 # Extract URL, uploader and title from webpage
1634 self.report_extraction(video_id)
1635 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1636 if mobj is None:
1637 self._downloader.trouble(u'ERROR: unable to extract media URL')
1638 return
1639 sequence = urllib.unquote(mobj.group(1))
1640 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1641 if mobj is None:
1642 self._downloader.trouble(u'ERROR: unable to extract media URL')
1643 return
1644 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1645
1646 # if needed add http://www.dailymotion.com/ if relative URL
1647
1648 video_url = mediaURL
1649
1650 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1651 if mobj is None:
1652 self._downloader.trouble(u'ERROR: unable to extract title')
1653 return
1654 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1655 video_title = sanitize_title(video_title)
1656 simple_title = _simplify_title(video_title)
1657
1658 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1659 if mobj is None:
1660 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1661 return
1662 video_uploader = mobj.group(1)
1663
1664 try:
1665 # Process video information
1666 self._downloader.process_info({
1667 'id': video_id.decode('utf-8'),
1668 'url': video_url.decode('utf-8'),
1669 'uploader': video_uploader.decode('utf-8'),
1670 'upload_date': u'NA',
1671 'title': video_title,
1672 'stitle': simple_title,
1673 'ext': video_extension.decode('utf-8'),
1674 'format': u'NA',
1675 'player_url': None,
1676 })
1677 except UnavailableVideoError:
1678 self._downloader.trouble(u'\nERROR: unable to download video')
1679
1680
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract metadata and media URL for a video.google.com video.

		Tries the mp4 download_url first and falls back to the flv
		videoUrl embedded in the page. Reports errors through the
		downloader's trouble() and returns None in all cases.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Optimistic default; corrected to 'flv' below if only the
		# fallback videoUrl is present.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the flv stream URL,
			# which the page embeds with \x-escaped characters.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = urllib.unquote(mobj.group(1))
			# Undo the page's hex escaping: '\x3d' -> '=' and '\x26' -> '&'
			mediaURL = mediaURL.replace('\\x3d', '\x3d')
			mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (only when requested: it costs an extra
		# search-page request)
		if self._downloader.params.get('forcethumbnail', False):
			# NOTE(review): abs(int(video_id)) assumes docid is a (possibly
			# negative) integer string; a non-numeric docid would raise
			# ValueError here — confirm against real docids.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': u'NA',
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1783
1784
1785 class PhotobucketIE(InfoExtractor):
1786 """Information extractor for photobucket.com."""
1787
1788 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1789 IE_NAME = u'photobucket'
1790
1791 def __init__(self, downloader=None):
1792 InfoExtractor.__init__(self, downloader)
1793
1794 def report_download_webpage(self, video_id):
1795 """Report webpage download."""
1796 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1797
1798 def report_extraction(self, video_id):
1799 """Report information extraction."""
1800 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1801
1802 def _real_extract(self, url):
1803 # Extract id from URL
1804 mobj = re.match(self._VALID_URL, url)
1805 if mobj is None:
1806 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1807 return
1808
1809 # At this point we have a new video
1810 self._downloader.increment_downloads()
1811 video_id = mobj.group(1)
1812
1813 video_extension = 'flv'
1814
1815 # Retrieve video webpage to extract further information
1816 request = urllib2.Request(url)
1817 try:
1818 self.report_download_webpage(video_id)
1819 webpage = urllib2.urlopen(request).read()
1820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1821 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1822 return
1823
1824 # Extract URL, uploader, and title from webpage
1825 self.report_extraction(video_id)
1826 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1827 if mobj is None:
1828 self._downloader.trouble(u'ERROR: unable to extract media URL')
1829 return
1830 mediaURL = urllib.unquote(mobj.group(1))
1831
1832 video_url = mediaURL
1833
1834 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1835 if mobj is None:
1836 self._downloader.trouble(u'ERROR: unable to extract title')
1837 return
1838 video_title = mobj.group(1).decode('utf-8')
1839 video_title = sanitize_title(video_title)
1840 simple_title = _simplify_title(vide_title)
1841
1842 video_uploader = mobj.group(2).decode('utf-8')
1843
1844 try:
1845 # Process video information
1846 self._downloader.process_info({
1847 'id': video_id.decode('utf-8'),
1848 'url': video_url.decode('utf-8'),
1849 'uploader': video_uploader,
1850 'upload_date': u'NA',
1851 'title': video_title,
1852 'stitle': simple_title,
1853 'ext': video_extension.decode('utf-8'),
1854 'format': u'NA',
1855 'player_url': None,
1856 })
1857 except UnavailableVideoError:
1858 self._downloader.trouble(u'\nERROR: unable to download video')
1859
1860
1861 class YahooIE(InfoExtractor):
1862 """Information extractor for video.yahoo.com."""
1863
1864 # _VALID_URL matches all Yahoo! Video URLs
1865 # _VPAGE_URL matches only the extractable '/watch/' URLs
1866 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1867 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1868 IE_NAME = u'video.yahoo'
1869
1870 def __init__(self, downloader=None):
1871 InfoExtractor.__init__(self, downloader)
1872
1873 def report_download_webpage(self, video_id):
1874 """Report webpage download."""
1875 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1876
1877 def report_extraction(self, video_id):
1878 """Report information extraction."""
1879 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1880
1881 def _real_extract(self, url, new_video=True):
1882 # Extract ID from URL
1883 mobj = re.match(self._VALID_URL, url)
1884 if mobj is None:
1885 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1886 return
1887
1888 # At this point we have a new video
1889 self._downloader.increment_downloads()
1890 video_id = mobj.group(2)
1891 video_extension = 'flv'
1892
1893 # Rewrite valid but non-extractable URLs as
1894 # extractable English language /watch/ URLs
1895 if re.match(self._VPAGE_URL, url) is None:
1896 request = urllib2.Request(url)
1897 try:
1898 webpage = urllib2.urlopen(request).read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1901 return
1902
1903 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1904 if mobj is None:
1905 self._downloader.trouble(u'ERROR: Unable to extract id field')
1906 return
1907 yahoo_id = mobj.group(1)
1908
1909 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1910 if mobj is None:
1911 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1912 return
1913 yahoo_vid = mobj.group(1)
1914
1915 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1916 return self._real_extract(url, new_video=False)
1917
1918 # Retrieve video webpage to extract further information
1919 request = urllib2.Request(url)
1920 try:
1921 self.report_download_webpage(video_id)
1922 webpage = urllib2.urlopen(request).read()
1923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1924 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1925 return
1926
1927 # Extract uploader and title from webpage
1928 self.report_extraction(video_id)
1929 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1930 if mobj is None:
1931 self._downloader.trouble(u'ERROR: unable to extract video title')
1932 return
1933 video_title = mobj.group(1).decode('utf-8')
1934 simple_title = _simplify_title(video_title)
1935
1936 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1937 if mobj is None:
1938 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1939 return
1940 video_uploader = mobj.group(1).decode('utf-8')
1941
1942 # Extract video thumbnail
1943 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1946 return
1947 video_thumbnail = mobj.group(1).decode('utf-8')
1948
1949 # Extract video description
1950 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1951 if mobj is None:
1952 self._downloader.trouble(u'ERROR: unable to extract video description')
1953 return
1954 video_description = mobj.group(1).decode('utf-8')
1955 if not video_description:
1956 video_description = 'No description available.'
1957
1958 # Extract video height and width
1959 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1960 if mobj is None:
1961 self._downloader.trouble(u'ERROR: unable to extract video height')
1962 return
1963 yv_video_height = mobj.group(1)
1964
1965 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1966 if mobj is None:
1967 self._downloader.trouble(u'ERROR: unable to extract video width')
1968 return
1969 yv_video_width = mobj.group(1)
1970
1971 # Retrieve video playlist to extract media URL
1972 # I'm not completely sure what all these options are, but we
1973 # seem to need most of them, otherwise the server sends a 401.
1974 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1975 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1976 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1977 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1978 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1979 try:
1980 self.report_download_webpage(video_id)
1981 webpage = urllib2.urlopen(request).read()
1982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1984 return
1985
1986 # Extract media URL from playlist XML
1987 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1988 if mobj is None:
1989 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1990 return
1991 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1992 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1993
1994 try:
1995 # Process video information
1996 self._downloader.process_info({
1997 'id': video_id.decode('utf-8'),
1998 'url': video_url,
1999 'uploader': video_uploader,
2000 'upload_date': u'NA',
2001 'title': video_title,
2002 'stitle': simple_title,
2003 'ext': video_extension.decode('utf-8'),
2004 'thumbnail': video_thumbnail.decode('utf-8'),
2005 'description': video_description,
2006 'thumbnail': video_thumbnail,
2007 'player_url': None,
2008 })
2009 except UnavailableVideoError:
2010 self._downloader.trouble(u'\nERROR: unable to download video')
2011
2012
2013 class VimeoIE(InfoExtractor):
2014 """Information extractor for vimeo.com."""
2015
2016 # _VALID_URL matches Vimeo URLs
2017 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2018 IE_NAME = u'vimeo'
2019
2020 def __init__(self, downloader=None):
2021 InfoExtractor.__init__(self, downloader)
2022
2023 def report_download_webpage(self, video_id):
2024 """Report webpage download."""
2025 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2026
2027 def report_extraction(self, video_id):
2028 """Report information extraction."""
2029 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2030
2031 def _real_extract(self, url, new_video=True):
2032 # Extract ID from URL
2033 mobj = re.match(self._VALID_URL, url)
2034 if mobj is None:
2035 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2036 return
2037
2038 # At this point we have a new video
2039 self._downloader.increment_downloads()
2040 video_id = mobj.group(1)
2041
2042 # Retrieve video webpage to extract further information
2043 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2044 try:
2045 self.report_download_webpage(video_id)
2046 webpage = urllib2.urlopen(request).read()
2047 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2049 return
2050
2051 # Now we begin extracting as much information as we can from what we
2052 # retrieved. First we extract the information common to all extractors,
2053 # and latter we extract those that are Vimeo specific.
2054 self.report_extraction(video_id)
2055
2056 # Extract title
2057 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2058 if mobj is None:
2059 self._downloader.trouble(u'ERROR: unable to extract video title')
2060 return
2061 video_title = mobj.group(1).decode('utf-8')
2062 simple_title = _simplify_title(video_title)
2063
2064 # Extract uploader
2065 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2066 if mobj is None:
2067 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2068 return
2069 video_uploader = mobj.group(1).decode('utf-8')
2070
2071 # Extract video thumbnail
2072 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2073 if mobj is None:
2074 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2075 return
2076 video_thumbnail = mobj.group(1).decode('utf-8')
2077
2078 # # Extract video description
2079 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2080 # if mobj is None:
2081 # self._downloader.trouble(u'ERROR: unable to extract video description')
2082 # return
2083 # video_description = mobj.group(1).decode('utf-8')
2084 # if not video_description: video_description = 'No description available.'
2085 video_description = 'Foo.'
2086
2087 # Vimeo specific: extract request signature
2088 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2089 if mobj is None:
2090 self._downloader.trouble(u'ERROR: unable to extract request signature')
2091 return
2092 sig = mobj.group(1).decode('utf-8')
2093
2094 # Vimeo specific: extract video quality information
2095 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2096 if mobj is None:
2097 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2098 return
2099 quality = mobj.group(1).decode('utf-8')
2100
2101 if int(quality) == 1:
2102 quality = 'hd'
2103 else:
2104 quality = 'sd'
2105
2106 # Vimeo specific: Extract request signature expiration
2107 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2108 if mobj is None:
2109 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2110 return
2111 sig_exp = mobj.group(1).decode('utf-8')
2112
2113 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2114
2115 try:
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2119 'url': video_url,
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
2124 'ext': u'mp4',
2125 'thumbnail': video_thumbnail.decode('utf-8'),
2126 'description': video_description,
2127 'thumbnail': video_thumbnail,
2128 'description': video_description,
2129 'player_url': None,
2130 })
2131 except UnavailableVideoError:
2132 self._downloader.trouble(u'ERROR: unable to download video')
2133
2134
2135 class GenericIE(InfoExtractor):
2136 """Generic last-resort information extractor."""
2137
2138 _VALID_URL = r'.*'
2139 IE_NAME = u'generic'
2140
2141 def __init__(self, downloader=None):
2142 InfoExtractor.__init__(self, downloader)
2143
2144 def report_download_webpage(self, video_id):
2145 """Report webpage download."""
2146 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2147 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2148
2149 def report_extraction(self, video_id):
2150 """Report information extraction."""
2151 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2152
2153 def _real_extract(self, url):
2154 # At this point we have a new video
2155 self._downloader.increment_downloads()
2156
2157 video_id = url.split('/')[-1]
2158 request = urllib2.Request(url)
2159 try:
2160 self.report_download_webpage(video_id)
2161 webpage = urllib2.urlopen(request).read()
2162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2163 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2164 return
2165 except ValueError, err:
2166 # since this is the last-resort InfoExtractor, if
2167 # this error is thrown, it'll be thrown here
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169 return
2170
2171 self.report_extraction(video_id)
2172 # Start with something easy: JW Player in SWFObject
2173 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2174 if mobj is None:
2175 # Broaden the search a little bit
2176 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2177 if mobj is None:
2178 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2179 return
2180
2181 # It's possible that one of the regexes
2182 # matched, but returned an empty group:
2183 if mobj.group(1) is None:
2184 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2185 return
2186
2187 video_url = urllib.unquote(mobj.group(1))
2188 video_id = os.path.basename(video_url)
2189
2190 # here's a fun little line of code for you:
2191 video_extension = os.path.splitext(video_id)[1][1:]
2192 video_id = os.path.splitext(video_id)[0]
2193
2194 # it's tempting to parse this further, but you would
2195 # have to take into account all the variations like
2196 # Video Title - Site Name
2197 # Site Name | Video Title
2198 # Video Title - Tagline | Site Name
2199 # and so on and so forth; it's just not practical
2200 mobj = re.search(r'<title>(.*)</title>', webpage)
2201 if mobj is None:
2202 self._downloader.trouble(u'ERROR: unable to extract title')
2203 return
2204 video_title = mobj.group(1).decode('utf-8')
2205 video_title = sanitize_title(video_title)
2206 simple_title = _simplify_title(video_title)
2207
2208 # video uploader is domain name
2209 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2210 if mobj is None:
2211 self._downloader.trouble(u'ERROR: unable to extract title')
2212 return
2213 video_uploader = mobj.group(1).decode('utf-8')
2214
2215 try:
2216 # Process video information
2217 self._downloader.process_info({
2218 'id': video_id.decode('utf-8'),
2219 'url': video_url.decode('utf-8'),
2220 'uploader': video_uploader,
2221 'upload_date': u'NA',
2222 'title': video_title,
2223 'stitle': simple_title,
2224 'ext': video_extension.decode('utf-8'),
2225 'format': u'NA',
2226 'player_url': None,
2227 })
2228 except UnavailableVideoError, err:
2229 self._downloader.trouble(u'\nERROR: unable to download video')
2230
2231
2232 class YoutubeSearchIE(InfoExtractor):
2233 """Information Extractor for YouTube search queries."""
2234 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2235 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2236 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2237 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2238 _youtube_ie = None
2239 _max_youtube_results = 1000
2240 IE_NAME = u'youtube:search'
2241
2242 def __init__(self, youtube_ie, downloader=None):
2243 InfoExtractor.__init__(self, downloader)
2244 self._youtube_ie = youtube_ie
2245
2246 def report_download_page(self, query, pagenum):
2247 """Report attempt to download playlist page with given number."""
2248 query = query.decode(preferredencoding())
2249 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2250
2251 def _real_initialize(self):
2252 self._youtube_ie.initialize()
2253
2254 def _real_extract(self, query):
2255 mobj = re.match(self._VALID_URL, query)
2256 if mobj is None:
2257 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2258 return
2259
2260 prefix, query = query.split(':')
2261 prefix = prefix[8:]
2262 query = query.encode('utf-8')
2263 if prefix == '':
2264 self._download_n_results(query, 1)
2265 return
2266 elif prefix == 'all':
2267 self._download_n_results(query, self._max_youtube_results)
2268 return
2269 else:
2270 try:
2271 n = long(prefix)
2272 if n <= 0:
2273 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2274 return
2275 elif n > self._max_youtube_results:
2276 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2277 n = self._max_youtube_results
2278 self._download_n_results(query, n)
2279 return
2280 except ValueError: # parsing prefix as integer fails
2281 self._download_n_results(query, 1)
2282 return
2283
2284 def _download_n_results(self, query, n):
2285 """Downloads a specified number of results for a query"""
2286
2287 video_ids = []
2288 already_seen = set()
2289 pagenum = 1
2290
2291 while True:
2292 self.report_download_page(query, pagenum)
2293 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2294 request = urllib2.Request(result_url)
2295 try:
2296 page = urllib2.urlopen(request).read()
2297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2298 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2299 return
2300
2301 # Extract video identifiers
2302 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2303 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2304 if video_id not in already_seen:
2305 video_ids.append(video_id)
2306 already_seen.add(video_id)
2307 if len(video_ids) == n:
2308 # Specified n videos reached
2309 for id in video_ids:
2310 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2311 return
2312
2313 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2314 for id in video_ids:
2315 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2316 return
2317
2318 pagenum = pagenum + 1
2319
2320
2321 class GoogleSearchIE(InfoExtractor):
2322 """Information Extractor for Google Video search queries."""
2323 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2324 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2325 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2326 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2327 _google_ie = None
2328 _max_google_results = 1000
2329 IE_NAME = u'video.google:search'
2330
2331 def __init__(self, google_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._google_ie = google_ie
2334
2335 def report_download_page(self, query, pagenum):
2336 """Report attempt to download playlist page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2339
2340 def _real_initialize(self):
2341 self._google_ie.initialize()
2342
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2345 if mobj is None:
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2347 return
2348
2349 prefix, query = query.split(':')
2350 prefix = prefix[8:]
2351 query = query.encode('utf-8')
2352 if prefix == '':
2353 self._download_n_results(query, 1)
2354 return
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_google_results)
2357 return
2358 else:
2359 try:
2360 n = long(prefix)
2361 if n <= 0:
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363 return
2364 elif n > self._max_google_results:
2365 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2366 n = self._max_google_results
2367 self._download_n_results(query, n)
2368 return
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2371 return
2372
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2375
2376 video_ids = []
2377 already_seen = set()
2378 pagenum = 1
2379
2380 while True:
2381 self.report_download_page(query, pagenum)
2382 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2383 request = urllib2.Request(result_url)
2384 try:
2385 page = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2388 return
2389
2390 # Extract video identifiers
2391 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2392 video_id = mobj.group(1)
2393 if video_id not in already_seen:
2394 video_ids.append(video_id)
2395 already_seen.add(video_id)
2396 if len(video_ids) == n:
2397 # Specified n videos reached
2398 for id in video_ids:
2399 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2400 return
2401
2402 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2403 for id in video_ids:
2404 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2405 return
2406
2407 pagenum = pagenum + 1
2408
2409
2410 class YahooSearchIE(InfoExtractor):
2411 """Information Extractor for Yahoo! Video search queries."""
2412 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2413 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2414 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2415 _MORE_PAGES_INDICATOR = r'\s*Next'
2416 _yahoo_ie = None
2417 _max_yahoo_results = 1000
2418 IE_NAME = u'video.yahoo:search'
2419
2420 def __init__(self, yahoo_ie, downloader=None):
2421 InfoExtractor.__init__(self, downloader)
2422 self._yahoo_ie = yahoo_ie
2423
2424 def report_download_page(self, query, pagenum):
2425 """Report attempt to download playlist page with given number."""
2426 query = query.decode(preferredencoding())
2427 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2428
2429 def _real_initialize(self):
2430 self._yahoo_ie.initialize()
2431
2432 def _real_extract(self, query):
2433 mobj = re.match(self._VALID_URL, query)
2434 if mobj is None:
2435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2436 return
2437
2438 prefix, query = query.split(':')
2439 prefix = prefix[8:]
2440 query = query.encode('utf-8')
2441 if prefix == '':
2442 self._download_n_results(query, 1)
2443 return
2444 elif prefix == 'all':
2445 self._download_n_results(query, self._max_yahoo_results)
2446 return
2447 else:
2448 try:
2449 n = long(prefix)
2450 if n <= 0:
2451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2452 return
2453 elif n > self._max_yahoo_results:
2454 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2455 n = self._max_yahoo_results
2456 self._download_n_results(query, n)
2457 return
2458 except ValueError: # parsing prefix as integer fails
2459 self._download_n_results(query, 1)
2460 return
2461
2462 def _download_n_results(self, query, n):
2463 """Downloads a specified number of results for a query"""
2464
2465 video_ids = []
2466 already_seen = set()
2467 pagenum = 1
2468
2469 while True:
2470 self.report_download_page(query, pagenum)
2471 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2472 request = urllib2.Request(result_url)
2473 try:
2474 page = urllib2.urlopen(request).read()
2475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2477 return
2478
2479 # Extract video identifiers
2480 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2481 video_id = mobj.group(1)
2482 if video_id not in already_seen:
2483 video_ids.append(video_id)
2484 already_seen.add(video_id)
2485 if len(video_ids) == n:
2486 # Specified n videos reached
2487 for id in video_ids:
2488 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2489 return
2490
2491 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2492 for id in video_ids:
2493 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2494 return
2495
2496 pagenum = pagenum + 1
2497
2498
2499 class YoutubePlaylistIE(InfoExtractor):
2500 """Information Extractor for YouTube playlists."""
2501
2502 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2503 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2504 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2505 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2506 _youtube_ie = None
2507 IE_NAME = u'youtube:playlist'
2508
2509 def __init__(self, youtube_ie, downloader=None):
2510 InfoExtractor.__init__(self, downloader)
2511 self._youtube_ie = youtube_ie
2512
2513 def report_download_page(self, playlist_id, pagenum):
2514 """Report attempt to download playlist page with given number."""
2515 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2516
2517 def _real_initialize(self):
2518 self._youtube_ie.initialize()
2519
2520 def _real_extract(self, url):
2521 # Extract playlist id
2522 mobj = re.match(self._VALID_URL, url)
2523 if mobj is None:
2524 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2525 return
2526
2527 # Single video case
2528 if mobj.group(3) is not None:
2529 self._youtube_ie.extract(mobj.group(3))
2530 return
2531
2532 # Download playlist pages
2533 # prefix is 'p' as default for playlists but there are other types that need extra care
2534 playlist_prefix = mobj.group(1)
2535 if playlist_prefix == 'a':
2536 playlist_access = 'artist'
2537 else:
2538 playlist_prefix = 'p'
2539 playlist_access = 'view_play_list'
2540 playlist_id = mobj.group(2)
2541 video_ids = []
2542 pagenum = 1
2543
2544 while True:
2545 self.report_download_page(playlist_id, pagenum)
2546 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2547 request = urllib2.Request(url)
2548 try:
2549 page = urllib2.urlopen(request).read()
2550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2552 return
2553
2554 # Extract video identifiers
2555 ids_in_page = []
2556 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2557 if mobj.group(1) not in ids_in_page:
2558 ids_in_page.append(mobj.group(1))
2559 video_ids.extend(ids_in_page)
2560
2561 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2562 break
2563 pagenum = pagenum + 1
2564
2565 playliststart = self._downloader.params.get('playliststart', 1) - 1
2566 playlistend = self._downloader.params.get('playlistend', -1)
2567 video_ids = video_ids[playliststart:playlistend]
2568
2569 for id in video_ids:
2570 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2571 return
2572
2573
2574 class YoutubeUserIE(InfoExtractor):
2575 """Information Extractor for YouTube users."""
2576
2577 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2578 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2579 _GDATA_PAGE_SIZE = 50
2580 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2581 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2582 _youtube_ie = None
2583 IE_NAME = u'youtube:user'
2584
2585 def __init__(self, youtube_ie, downloader=None):
2586 InfoExtractor.__init__(self, downloader)
2587 self._youtube_ie = youtube_ie
2588
2589 def report_download_page(self, username, start_index):
2590 """Report attempt to download user page."""
2591 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2592 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2593
2594 def _real_initialize(self):
2595 self._youtube_ie.initialize()
2596
2597 def _real_extract(self, url):
2598 # Extract username
2599 mobj = re.match(self._VALID_URL, url)
2600 if mobj is None:
2601 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2602 return
2603
2604 username = mobj.group(1)
2605
2606 # Download video ids using YouTube Data API. Result size per
2607 # query is limited (currently to 50 videos) so we need to query
2608 # page by page until there are no video ids - it means we got
2609 # all of them.
2610
2611 video_ids = []
2612 pagenum = 0
2613
2614 while True:
2615 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2616 self.report_download_page(username, start_index)
2617
2618 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2619
2620 try:
2621 page = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2624 return
2625
2626 # Extract video identifiers
2627 ids_in_page = []
2628
2629 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2630 if mobj.group(1) not in ids_in_page:
2631 ids_in_page.append(mobj.group(1))
2632
2633 video_ids.extend(ids_in_page)
2634
2635 # A little optimization - if current page is not
2636 # "full", ie. does not contain PAGE_SIZE video ids then
2637 # we can assume that this page is the last one - there
2638 # are no more ids on further pages - no need to query
2639 # again.
2640
2641 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2642 break
2643
2644 pagenum += 1
2645
2646 all_ids_count = len(video_ids)
2647 playliststart = self._downloader.params.get('playliststart', 1) - 1
2648 playlistend = self._downloader.params.get('playlistend', -1)
2649
2650 if playlistend == -1:
2651 video_ids = video_ids[playliststart:]
2652 else:
2653 video_ids = video_ids[playliststart:playlistend]
2654
2655 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2656 (username, all_ids_count, len(video_ids)))
2657
2658 for video_id in video_ids:
2659 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2660
2661
2662 class DepositFilesIE(InfoExtractor):
2663 """Information extractor for depositfiles.com"""
2664
2665 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2666 IE_NAME = u'DepositFiles'
2667
2668 def __init__(self, downloader=None):
2669 InfoExtractor.__init__(self, downloader)
2670
2671 def report_download_webpage(self, file_id):
2672 """Report webpage download."""
2673 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2674
2675 def report_extraction(self, file_id):
2676 """Report information extraction."""
2677 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2678
2679 def _real_extract(self, url):
2680 # At this point we have a new file
2681 self._downloader.increment_downloads()
2682
2683 file_id = url.split('/')[-1]
2684 # Rebuild url in english locale
2685 url = 'http://depositfiles.com/en/files/' + file_id
2686
2687 # Retrieve file webpage with 'Free download' button pressed
2688 free_download_indication = { 'gateway_result' : '1' }
2689 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2690 try:
2691 self.report_download_webpage(file_id)
2692 webpage = urllib2.urlopen(request).read()
2693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2695 return
2696
2697 # Search for the real file URL
2698 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2699 if (mobj is None) or (mobj.group(1) is None):
2700 # Try to figure out reason of the error.
2701 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2702 if (mobj is not None) and (mobj.group(1) is not None):
2703 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2704 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2705 else:
2706 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2707 return
2708
2709 file_url = mobj.group(1)
2710 file_extension = os.path.splitext(file_url)[1][1:]
2711
2712 # Search for file title
2713 mobj = re.search(r'<b title="(.*?)">', webpage)
2714 if mobj is None:
2715 self._downloader.trouble(u'ERROR: unable to extract title')
2716 return
2717 file_title = mobj.group(1).decode('utf-8')
2718
2719 try:
2720 # Process file information
2721 self._downloader.process_info({
2722 'id': file_id.decode('utf-8'),
2723 'url': file_url.decode('utf-8'),
2724 'uploader': u'NA',
2725 'upload_date': u'NA',
2726 'title': file_title,
2727 'stitle': file_title,
2728 'ext': file_extension.decode('utf-8'),
2729 'format': u'NA',
2730 'player_url': None,
2731 })
2732 except UnavailableVideoError, err:
2733 self._downloader.trouble(u'ERROR: unable to download file')
2734
2735
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	# Matches video/photo permalinks; the numeric id is captured as 'ID'.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	# Mobile login endpoint used by _real_initialize.
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Format identifiers as they appear in the page's JS, best quality first.
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page.

		Returns a dict with any of 'title', 'description', 'owner',
		'thumbnail' that could be scraped, plus a 'video_urls' dict mapping
		format name -> media URL (possibly empty).
		"""
		# General data: each entry is a regex whose first group is the value.
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Values are JS-escaped and percent-encoded in the page.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook if credentials are available.

		Credentials come from the --username/--password options or, with
		--netrc, from the 'facebook' machine entry in ~/.netrc. Failures
		are reported as warnings; extraction proceeds unauthenticated.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		if useremail is None:
			# No credentials: continue anonymously.
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains a login form, the login failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# thumbnail image (missing thumbnail is only a warning, not fatal)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		# NOTE(review): _parse_page never sets an 'upload_date' key, so this
		# branch appears to be dead code — confirm before relying on it.
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Bare except keeps best-effort date parsing from
					# aborting the extraction.
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			# Restrict the candidate list to formats at or below the limit.
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

			for format_param, video_real_url in video_url_list:

				# At this point we have a new video
				self._downloader.increment_downloads()

				# Extension
				video_extension = self._video_extensions.get(format_param, 'mp4')

				try:
					# Process video information
					self._downloader.process_info({
						'id':		video_id.decode('utf-8'),
						'url':		video_real_url.decode('utf-8'),
						'uploader':	video_uploader.decode('utf-8'),
						'upload_date':	upload_date,
						'title':	video_title,
						'stitle':	simple_title,
						'ext':		video_extension.decode('utf-8'),
						'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
						'thumbnail':	video_thumbnail.decode('utf-8'),
						'description':	video_description.decode('utf-8'),
						'player_url':	None,
					})
				except UnavailableVideoError, err:
					self._downloader.trouble(u'\nERROR: unable to download video')
2951
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension at the end of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for JSON metadata by appending skin=json to the URL.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			# If the server answers with the media itself rather than JSON,
			# treat it as a direct download and keep the open handle.
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh was opened in the try block above; read the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# The payload is either wrapped in a 'Post' key or bare.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3044
3045
3046 class MyVideoIE(InfoExtractor):
3047 """Information Extractor for myvideo.de."""
3048
3049 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3050 IE_NAME = u'myvideo'
3051
3052 def __init__(self, downloader=None):
3053 InfoExtractor.__init__(self, downloader)
3054
3055 def report_download_webpage(self, video_id):
3056 """Report webpage download."""
3057 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3058
3059 def report_extraction(self, video_id):
3060 """Report information extraction."""
3061 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3062
3063 def _real_extract(self,url):
3064 mobj = re.match(self._VALID_URL, url)
3065 if mobj is None:
3066 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3067 return
3068
3069 video_id = mobj.group(1)
3070
3071 # Get video webpage
3072 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3073 try:
3074 self.report_download_webpage(video_id)
3075 webpage = urllib2.urlopen(request).read()
3076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3077 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3078 return
3079
3080 self.report_extraction(video_id)
3081 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3082 webpage)
3083 if mobj is None:
3084 self._downloader.trouble(u'ERROR: unable to extract media URL')
3085 return
3086 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3087
3088 mobj = re.search('<title>([^<]+)</title>', webpage)
3089 if mobj is None:
3090 self._downloader.trouble(u'ERROR: unable to extract title')
3091 return
3092
3093 video_title = mobj.group(1)
3094 video_title = sanitize_title(video_title)
3095
3096 simple_title = _simplify_title(video_title)
3097
3098 try:
3099 self._downloader.process_info({
3100 'id': video_id,
3101 'url': video_url,
3102 'uploader': u'NA',
3103 'upload_date': u'NA',
3104 'title': video_title,
3105 'stitle': simple_title,
3106 'ext': u'flv',
3107 'format': u'NA',
3108 'player_url': None,
3109 })
3110 except UnavailableVideoError:
3111 self._downloader.trouble(u'\nERROR: Unable to download video')
3112
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ':tds'/':colbert' style shortcut or a full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that extraction of an episode has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS show index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Expand ':tds' style shortcuts to the show's full-episodes page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty episode group means "download the newest episode".
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# The full-episodes page redirects to the newest episode;
			# re-match against the redirected URL to recover its title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The page embeds the Flash player URL either as a <param> or a JS var.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			# Follow redirects to get the canonical player URL.
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# One episode is split into several <item> acts; download each act.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			# Collect (bitrate, url) pairs for every available rendition.
			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3247
3248
3249 class EscapistIE(InfoExtractor):
3250 """Information extractor for The Escapist """
3251
3252 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3253 IE_NAME = u'escapist'
3254
3255 def report_extraction(self, showName):
3256 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3257
3258 def report_config_download(self, showName):
3259 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3260
3261 def _real_extract(self, url):
3262 htmlParser = HTMLParser.HTMLParser()
3263
3264 mobj = re.match(self._VALID_URL, url)
3265 if mobj is None:
3266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3267 return
3268 showName = mobj.group('showname')
3269 videoId = mobj.group('episode')
3270
3271 self.report_extraction(showName)
3272 try:
3273 webPage = urllib2.urlopen(url).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3276 return
3277
3278 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3279 description = htmlParser.unescape(descMatch.group(1))
3280 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3281 imgUrl = htmlParser.unescape(imgMatch.group(1))
3282 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3283 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3284 configUrlMatch = re.search('config=(.*)$', playerUrl)
3285 configUrl = urllib2.unquote(configUrlMatch.group(1))
3286
3287 self.report_config_download(showName)
3288 try:
3289 configJSON = urllib2.urlopen(configUrl).read()
3290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3292 return
3293
3294 # Technically, it's JavaScript, not JSON
3295 configJSON = configJSON.replace("'", '"')
3296
3297 try:
3298 config = json.loads(configJSON)
3299 except (ValueError,), err:
3300 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3301 return
3302
3303 playlist = config['playlist']
3304 videoUrl = playlist[1]['url']
3305
3306 self._downloader.increment_downloads()
3307 info = {
3308 'id': videoId,
3309 'url': videoUrl,
3310 'uploader': showName,
3311 'upload_date': None,
3312 'title': showName,
3313 'stitle': _simplify_title(showName),
3314 'ext': 'flv',
3315 'format': 'flv',
3316 'thumbnail': imgUrl,
3317 'description': description,
3318 'player_url': playerUrl,
3319 }
3320
3321 try:
3322 self._downloader.process_info(info)
3323 except UnavailableVideoError, err:
3324 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3325
3326
3327 class CollegeHumorIE(InfoExtractor):
3328 """Information extractor for collegehumor.com"""
3329
3330 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3331 IE_NAME = u'collegehumor'
3332
3333 def report_webpage(self, video_id):
3334 """Report information extraction."""
3335 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3336
3337 def report_extraction(self, video_id):
3338 """Report information extraction."""
3339 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3340
3341 def _real_extract(self, url):
3342 htmlParser = HTMLParser.HTMLParser()
3343
3344 mobj = re.match(self._VALID_URL, url)
3345 if mobj is None:
3346 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3347 return
3348 video_id = mobj.group('videoid')
3349
3350 self.report_webpage(video_id)
3351 request = urllib2.Request(url)
3352 try:
3353 webpage = urllib2.urlopen(request).read()
3354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3356 return
3357
3358 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3359 if m is None:
3360 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3361 return
3362 internal_video_id = m.group('internalvideoid')
3363
3364 info = {
3365 'id': video_id,
3366 'internal_id': internal_video_id,
3367 }
3368
3369 self.report_extraction(video_id)
3370 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3371 try:
3372 metaXml = urllib2.urlopen(xmlUrl).read()
3373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3375 return
3376
3377 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3378 try:
3379 videoNode = mdoc.findall('./video')[0]
3380 info['description'] = videoNode.findall('./description')[0].text
3381 info['title'] = videoNode.findall('./caption')[0].text
3382 info['stitle'] = _simplify_title(info['title'])
3383 info['url'] = videoNode.findall('./file')[0].text
3384 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3385 info['ext'] = info['url'].rpartition('.')[2]
3386 info['format'] = info['ext']
3387 except IndexError:
3388 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3389 return
3390
3391 self._downloader.increment_downloads()
3392
3393 try:
3394 self._downloader.process_info(info)
3395 except UnavailableVideoError, err:
3396 self._downloader.trouble(u'\nERROR: unable to download video')
3397
3398
3399 class XVideosIE(InfoExtractor):
3400 """Information extractor for xvideos.com"""
3401
3402 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3403 IE_NAME = u'xvideos'
3404
3405 def report_webpage(self, video_id):
3406 """Report information extraction."""
3407 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3408
3409 def report_extraction(self, video_id):
3410 """Report information extraction."""
3411 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3412
3413 def _real_extract(self, url):
3414 htmlParser = HTMLParser.HTMLParser()
3415
3416 mobj = re.match(self._VALID_URL, url)
3417 if mobj is None:
3418 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3419 return
3420 video_id = mobj.group(1).decode('utf-8')
3421
3422 self.report_webpage(video_id)
3423
3424 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3425 try:
3426 webpage = urllib2.urlopen(request).read()
3427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3428 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3429 return
3430
3431 self.report_extraction(video_id)
3432
3433
3434 # Extract video URL
3435 mobj = re.search(r'flv_url=(.+?)&', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video url')
3438 return
3439 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3440
3441
3442 # Extract title
3443 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3444 if mobj is None:
3445 self._downloader.trouble(u'ERROR: unable to extract video title')
3446 return
3447 video_title = mobj.group(1).decode('utf-8')
3448
3449
3450 # Extract video thumbnail
3451 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3452 if mobj is None:
3453 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3454 return
3455 video_thumbnail = mobj.group(1).decode('utf-8')
3456
3457
3458
3459 self._downloader.increment_downloads()
3460 info = {
3461 'id': video_id,
3462 'url': video_url,
3463 'uploader': None,
3464 'upload_date': None,
3465 'title': video_title,
3466 'stitle': _simplify_title(video_title),
3467 'ext': 'flv',
3468 'format': 'flv',
3469 'thumbnail': video_thumbnail,
3470 'description': None,
3471 'player_url': None,
3472 }
3473
3474 try:
3475 self._downloader.process_info(info)
3476 except UnavailableVideoError, err:
3477 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3478
3479
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	   To access the media, the uid of the song and a stream token
	   must be extracted from the page source and the script must make
	   a request to media.soundcloud.com/crossdomain.xml. Then
	   the media can be grabbed by requesting from an url composed
	   of the stream token and uid
	 """

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		htmlParser = HTMLParser.HTMLParser()

		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title =  mobj.group(2).decode('utf-8')
		simple_title = uploader + '-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		# NOTE(review): if this regex fails, video_id and stream_token are
		# never bound and the process_info call below raises NameError —
		# confirm and consider an explicit trouble()+return here.
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		if mobj:
			video_id = mobj.group(1)
			stream_token = mobj.group(2)

		# extract unsimplified title
		# NOTE(review): 'title' is extracted but never used below; the
		# simplified slug title is what gets reported as the title.
		mobj = re.search('"title":"(.*?)",', webpage)
		if mobj:
			title = mobj.group(1)

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		if mobj:
			description = mobj.group(1)

		# upload date
		upload_date = None
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		if mobj:
			try:
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception, e:
				# NOTE(review): prints to stdout instead of going through the
				# downloader's reporting machinery.
				print str(e)

		# for soundcloud, a request to a cross domain is required for cookies
		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

		try:
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		mediaURL,
				'uploader':	uploader.decode('utf-8'),
				'upload_date':  upload_date,
				'title':	simple_title.decode('utf-8'),
				'stitle':	simple_title.decode('utf-8'),
				'ext':		u'mp3',
				'format':	u'NA',
				'player_url':	None,
				'description': description.decode('utf-8')
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
3576
3577
3578 class InfoQIE(InfoExtractor):
3579 """Information extractor for infoq.com"""
3580
3581 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3582 IE_NAME = u'infoq'
3583
3584 def report_webpage(self, video_id):
3585 """Report information extraction."""
3586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3587
3588 def report_extraction(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3591
3592 def _real_extract(self, url):
3593 htmlParser = HTMLParser.HTMLParser()
3594
3595 mobj = re.match(self._VALID_URL, url)
3596 if mobj is None:
3597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3598 return
3599
3600 self.report_webpage(url)
3601
3602 request = urllib2.Request(url)
3603 try:
3604 webpage = urllib2.urlopen(request).read()
3605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3606 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3607 return
3608
3609 self.report_extraction(url)
3610
3611
3612 # Extract video URL
3613 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3614 if mobj is None:
3615 self._downloader.trouble(u'ERROR: unable to extract video url')
3616 return
3617 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3618
3619
3620 # Extract title
3621 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3622 if mobj is None:
3623 self._downloader.trouble(u'ERROR: unable to extract video title')
3624 return
3625 video_title = mobj.group(1).decode('utf-8')
3626
3627 # Extract description
3628 video_description = u'No description available.'
3629 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3630 if mobj is not None:
3631 video_description = mobj.group(1).decode('utf-8')
3632
3633 video_filename = video_url.split('/')[-1]
3634 video_id, extension = video_filename.split('.')
3635
3636 self._downloader.increment_downloads()
3637 info = {
3638 'id': video_id,
3639 'url': video_url,
3640 'uploader': None,
3641 'upload_date': None,
3642 'title': video_title,
3643 'stitle': _simplify_title(video_title),
3644 'ext': extension,
3645 'format': extension, # Extension is always(?) mp4, but seems to be flv
3646 'thumbnail': None,
3647 'description': video_description,
3648 'player_url': None,
3649 }
3650
3651 try:
3652 self._downloader.process_info(info)
3653 except UnavailableVideoError, err:
3654 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3655
3656 class MixcloudIE(InfoExtractor):
3657 """Information extractor for www.mixcloud.com"""
3658 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3659 IE_NAME = u'mixcloud'
3660
3661 def __init__(self, downloader=None):
3662 InfoExtractor.__init__(self, downloader)
3663
3664 def report_download_json(self, file_id):
3665 """Report JSON download."""
3666 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3667
3668 def report_extraction(self, file_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3671
3672 def get_urls(self, jsonData, fmt, bitrate='best'):
3673 """Get urls from 'audio_formats' section in json"""
3674 file_url = None
3675 try:
3676 bitrate_list = jsonData[fmt]
3677 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3678 bitrate = max(bitrate_list) # select highest
3679
3680 url_list = jsonData[fmt][bitrate]
3681 except TypeError: # we have no bitrate info.
3682 url_list = jsonData[fmt]
3683
3684 return url_list
3685
3686 def check_urls(self, url_list):
3687 """Returns 1st active url from list"""
3688 for url in url_list:
3689 try:
3690 urllib2.urlopen(url)
3691 return url
3692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3693 url = None
3694
3695 return None
3696
3697 def _print_formats(self, formats):
3698 print 'Available formats:'
3699 for fmt in formats.keys():
3700 for b in formats[fmt]:
3701 try:
3702 ext = formats[fmt][b][0]
3703 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3704 except TypeError: # we have no bitrate info
3705 ext = formats[fmt][0]
3706 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3707 break
3708
3709 def _real_extract(self, url):
3710 mobj = re.match(self._VALID_URL, url)
3711 if mobj is None:
3712 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3713 return
3714 # extract uploader & filename from url
3715 uploader = mobj.group(1).decode('utf-8')
3716 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3717
3718 # construct API request
3719 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3720 # retrieve .json file with links to files
3721 request = urllib2.Request(file_url)
3722 try:
3723 self.report_download_json(file_url)
3724 jsonData = urllib2.urlopen(request).read()
3725 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3726 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3727 return
3728
3729 # parse JSON
3730 json_data = json.loads(jsonData)
3731 player_url = json_data['player_swf_url']
3732 formats = dict(json_data['audio_formats'])
3733
3734 req_format = self._downloader.params.get('format', None)
3735 bitrate = None
3736
3737 if self._downloader.params.get('listformats', None):
3738 self._print_formats(formats)
3739 return
3740
3741 if req_format is None or req_format == 'best':
3742 for format_param in formats.keys():
3743 url_list = self.get_urls(formats, format_param)
3744 # check urls
3745 file_url = self.check_urls(url_list)
3746 if file_url is not None:
3747 break # got it!
3748 else:
3749 if req_format not in formats.keys():
3750 self._downloader.trouble(u'ERROR: format is not available')
3751 return
3752
3753 url_list = self.get_urls(formats, req_format)
3754 file_url = self.check_urls(url_list)
3755 format_param = req_format
3756
3757 # We have audio
3758 self._downloader.increment_downloads()
3759 try:
3760 # Process file information
3761 self._downloader.process_info({
3762 'id': file_id.decode('utf-8'),
3763 'url': file_url.decode('utf-8'),
3764 'uploader': uploader.decode('utf-8'),
3765 'upload_date': u'NA',
3766 'title': json_data['name'],
3767 'stitle': _simplify_title(json_data['name']),
3768 'ext': file_url.split('.')[-1].decode('utf-8'),
3769 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3770 'thumbnail': json_data['thumbnail_url'],
3771 'description': json_data['description'],
3772 'player_url': player_url.decode('utf-8'),
3773 })
3774 except UnavailableVideoError, err:
3775 self._downloader.trouble(u'ERROR: unable to download file')
3776
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Three URL shapes are recognized via named groups:
	#   - VideoPage with course= and video=  -> a single video
	#   - CoursePage with course= only      -> a course playlist
	#   - the site root / HomePage          -> playlist of all courses
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL shape: single video, course playlist, or site root.

		Playlist branches emit 'reference' entries and recurse through
		self.extract() for each linked page.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media files.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# findall()[0] raises IndexError when the element is missing.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Title/description are scraped best-effort; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Deduplicate while preserving page order, then recurse per video.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Every course page linked from the root becomes a playlist entry.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3895
3896 class MTVIE(InfoExtractor):
3897 """Information extractor for MTV.com"""
3898
3899 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3900 IE_NAME = u'mtv'
3901
3902 def report_webpage(self, video_id):
3903 """Report information extraction."""
3904 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3905
3906 def report_extraction(self, video_id):
3907 """Report information extraction."""
3908 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3909
3910 def _real_extract(self, url):
3911 mobj = re.match(self._VALID_URL, url)
3912 if mobj is None:
3913 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3914 return
3915 if not mobj.group('proto'):
3916 url = 'http://' + url
3917 video_id = mobj.group('videoid')
3918 self.report_webpage(video_id)
3919
3920 request = urllib2.Request(url)
3921 try:
3922 webpage = urllib2.urlopen(request).read()
3923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3924 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3925 return
3926
3927 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3928 if mobj is None:
3929 self._downloader.trouble(u'ERROR: unable to extract song name')
3930 return
3931 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3932 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3933 if mobj is None:
3934 self._downloader.trouble(u'ERROR: unable to extract performer')
3935 return
3936 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3937 video_title = performer + ' - ' + song_name
3938
3939 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3940 if mobj is None:
3941 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3942 return
3943 mtvn_uri = mobj.group(1)
3944
3945 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3946 if mobj is None:
3947 self._downloader.trouble(u'ERROR: unable to extract content id')
3948 return
3949 content_id = mobj.group(1)
3950
3951 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3952 self.report_extraction(video_id)
3953 request = urllib2.Request(videogen_url)
3954 try:
3955 metadataXml = urllib2.urlopen(request).read()
3956 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3957 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3958 return
3959
3960 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3961 renditions = mdoc.findall('.//rendition')
3962
3963 # For now, always pick the highest quality.
3964 rendition = renditions[-1]
3965
3966 try:
3967 _,_,ext = rendition.attrib['type'].partition('/')
3968 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3969 video_url = rendition.find('./src').text
3970 except KeyError:
3971 self._downloader.trouble('Invalid rendition field.')
3972 return
3973
3974 self._downloader.increment_downloads()
3975 info = {
3976 'id': video_id,
3977 'url': video_url,
3978 'uploader': performer,
3979 'title': video_title,
3980 'stitle': _simplify_title(video_title),
3981 'ext': ext,
3982 'format': format,
3983 }
3984
3985 try:
3986 self._downloader.process_info(info)
3987 except UnavailableVideoError, err:
3988 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3989
3990
class PostProcessor(object):
	"""Base class for all post-processing steps.

	A PostProcessor is attached to a downloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() first
	with the initial info dictionary and then with whatever the previous
	processor returned.

	A return value of None (or reaching the end of the chain) stops the
	chain. Registration is mutual, mirroring how InfoExtractor objects
	are wired to the downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is an InfoExtractor-style dictionary
		with one extra field, "filepath", naming the downloaded file.

		Return None to stop the post-processing chain, or an information
		dictionary (possibly the received one with some fields changed)
		to pass to the next processor in the chain. May also raise
		PostProcessingError, which the calling downloader handles.
		"""
		return information # the base class is a no-op
4036
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails to convert the audio stream.

	Derives from Exception rather than BaseException: BaseException is
	reserved for non-error exits (KeyboardInterrupt, SystemExit) and let
	this error slip past generic ``except Exception`` handlers.
	"""

	def __init__(self, message):
		# Initialize the base class so str(err) and err.args work too.
		Exception.__init__(self, message)
		# Keep the .message attribute that callers read for the error text.
		self.message = message
4040
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg/ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source codec losslessly where possible.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at *path* via ffprobe,
		or None if ffprobe is missing, fails, or reports no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name= before codec_type= within each stream
		# block, so remember the last codec_name seen and return it when the
		# matching stream turns out to be an audio stream.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to convert *path* to *out_path*.

		*codec* of None lets ffmpeg pick the encoder; otherwise it is passed
		via -acodec. Raises AudioConversionError on a missing ffmpeg binary
		or a non-zero exit status (with ffmpeg's last stderr line).
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		# -vn drops the video stream; '--' guards against filenames that
		# start with a dash.
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg executable was not found.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert the downloaded file to the preferred audio format.

		Returns None on failure (stops the PP chain) or *information*
		with 'filepath' pointing at the new audio file.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the source codec already matches the request (or
		# 'best' was asked), so prefer a lossless stream copy when we can.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4165
4166
4167 def updateSelf(downloader, filename):
4168 ''' Update the program file with the latest version from the repository '''
4169 # Note: downloader only used for options
4170 if not os.access(filename, os.W_OK):
4171 sys.exit('ERROR: no write permissions on %s' % filename)
4172
4173 downloader.to_screen(u'Updating to latest version...')
4174
4175 try:
4176 try:
4177 urlh = urllib.urlopen(UPDATE_URL)
4178 newcontent = urlh.read()
4179
4180 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4181 if vmatch is not None and vmatch.group(1) == __version__:
4182 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4183 return
4184 finally:
4185 urlh.close()
4186 except (IOError, OSError), err:
4187 sys.exit('ERROR: unable to download latest version')
4188
4189 try:
4190 outf = open(filename, 'wb')
4191 try:
4192 outf.write(newcontent)
4193 finally:
4194 outf.close()
4195 except (IOError, OSError), err:
4196 sys.exit('ERROR: unable to overwrite current version')
4197
4198 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4199
4200 def parseOpts():
4201 # Deferred imports
4202 import getpass
4203 import optparse
4204 import shlex
4205
4206 def _readOptions(filename_bytes):
4207 try:
4208 optionf = open(filename_bytes)
4209 except IOError:
4210 return [] # silently skip if file is not present
4211 try:
4212 res = []
4213 for l in optionf:
4214 res += shlex.split(l, comments=True)
4215 finally:
4216 optionf.close()
4217 return res
4218
4219 def _format_option_string(option):
4220 ''' ('-o', '--option') -> -o, --format METAVAR'''
4221
4222 opts = []
4223
4224 if option._short_opts: opts.append(option._short_opts[0])
4225 if option._long_opts: opts.append(option._long_opts[0])
4226 if len(opts) > 1: opts.insert(1, ', ')
4227
4228 if option.takes_value(): opts.append(' %s' % option.metavar)
4229
4230 return "".join(opts)
4231
4232 def _find_term_columns():
4233 columns = os.environ.get('COLUMNS', None)
4234 if columns:
4235 return int(columns)
4236
4237 try:
4238 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4239 out,err = sp.communicate()
4240 return int(out.split()[1])
4241 except:
4242 pass
4243 return None
4244
4245 max_width = 80
4246 max_help_position = 80
4247
4248 # No need to wrap help messages if we're on a wide console
4249 columns = _find_term_columns()
4250 if columns: max_width = columns
4251
4252 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4253 fmt.format_option_strings = _format_option_string
4254
4255 kw = {
4256 'version' : __version__,
4257 'formatter' : fmt,
4258 'usage' : '%prog [options] url [url...]',
4259 'conflict_handler' : 'resolve',
4260 }
4261
4262 parser = optparse.OptionParser(**kw)
4263
4264 # option groups
4265 general = optparse.OptionGroup(parser, 'General Options')
4266 selection = optparse.OptionGroup(parser, 'Video Selection')
4267 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4268 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4269 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4270 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4271 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4272
4273 general.add_option('-h', '--help',
4274 action='help', help='print this help text and exit')
4275 general.add_option('-v', '--version',
4276 action='version', help='print program version and exit')
4277 general.add_option('-U', '--update',
4278 action='store_true', dest='update_self', help='update this program to latest version')
4279 general.add_option('-i', '--ignore-errors',
4280 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4281 general.add_option('-r', '--rate-limit',
4282 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4283 general.add_option('-R', '--retries',
4284 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4285 general.add_option('--dump-user-agent',
4286 action='store_true', dest='dump_user_agent',
4287 help='display the current browser identification', default=False)
4288 general.add_option('--list-extractors',
4289 action='store_true', dest='list_extractors',
4290 help='List all supported extractors and the URLs they would handle', default=False)
4291
4292 selection.add_option('--playlist-start',
4293 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4294 selection.add_option('--playlist-end',
4295 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4296 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4297 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4298 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4299
4300 authentication.add_option('-u', '--username',
4301 dest='username', metavar='USERNAME', help='account username')
4302 authentication.add_option('-p', '--password',
4303 dest='password', metavar='PASSWORD', help='account password')
4304 authentication.add_option('-n', '--netrc',
4305 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4306
4307
4308 video_format.add_option('-f', '--format',
4309 action='store', dest='format', metavar='FORMAT', help='video format code')
4310 video_format.add_option('--all-formats',
4311 action='store_const', dest='format', help='download all available video formats', const='all')
4312 video_format.add_option('--prefer-free-formats',
4313 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4314 video_format.add_option('--max-quality',
4315 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4316 video_format.add_option('-F', '--list-formats',
4317 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4318
4319
4320 verbosity.add_option('-q', '--quiet',
4321 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4322 verbosity.add_option('-s', '--simulate',
4323 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4324 verbosity.add_option('--skip-download',
4325 action='store_true', dest='skip_download', help='do not download the video', default=False)
4326 verbosity.add_option('-g', '--get-url',
4327 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4328 verbosity.add_option('-e', '--get-title',
4329 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4330 verbosity.add_option('--get-thumbnail',
4331 action='store_true', dest='getthumbnail',
4332 help='simulate, quiet but print thumbnail URL', default=False)
4333 verbosity.add_option('--get-description',
4334 action='store_true', dest='getdescription',
4335 help='simulate, quiet but print video description', default=False)
4336 verbosity.add_option('--get-filename',
4337 action='store_true', dest='getfilename',
4338 help='simulate, quiet but print output filename', default=False)
4339 verbosity.add_option('--get-format',
4340 action='store_true', dest='getformat',
4341 help='simulate, quiet but print output format', default=False)
4342 verbosity.add_option('--no-progress',
4343 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4344 verbosity.add_option('--console-title',
4345 action='store_true', dest='consoletitle',
4346 help='display progress in console titlebar', default=False)
4347
4348
4349 filesystem.add_option('-t', '--title',
4350 action='store_true', dest='usetitle', help='use title in file name', default=False)
4351 filesystem.add_option('-l', '--literal',
4352 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4353 filesystem.add_option('-A', '--auto-number',
4354 action='store_true', dest='autonumber',
4355 help='number downloaded files starting from 00000', default=False)
4356 filesystem.add_option('-o', '--output',
4357 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4358 filesystem.add_option('-a', '--batch-file',
4359 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4360 filesystem.add_option('-w', '--no-overwrites',
4361 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4362 filesystem.add_option('-c', '--continue',
4363 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4364 filesystem.add_option('--no-continue',
4365 action='store_false', dest='continue_dl',
4366 help='do not resume partially downloaded files (restart from beginning)')
4367 filesystem.add_option('--cookies',
4368 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4369 filesystem.add_option('--no-part',
4370 action='store_true', dest='nopart', help='do not use .part files', default=False)
4371 filesystem.add_option('--no-mtime',
4372 action='store_false', dest='updatetime',
4373 help='do not use the Last-modified header to set the file modification time', default=True)
4374 filesystem.add_option('--write-description',
4375 action='store_true', dest='writedescription',
4376 help='write video description to a .description file', default=False)
4377 filesystem.add_option('--write-info-json',
4378 action='store_true', dest='writeinfojson',
4379 help='write video metadata to a .info.json file', default=False)
4380
4381
4382 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4383 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4384 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4385 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4386 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4387 help='ffmpeg audio bitrate specification, 128k by default')
4388 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4389 help='keeps the video file on disk after the post-processing; the video is erased by default')
4390
4391
4392 parser.add_option_group(general)
4393 parser.add_option_group(selection)
4394 parser.add_option_group(filesystem)
4395 parser.add_option_group(verbosity)
4396 parser.add_option_group(video_format)
4397 parser.add_option_group(authentication)
4398 parser.add_option_group(postproc)
4399
4400 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4401 if xdg_config_home:
4402 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4403 else:
4404 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4405 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4406 opts, args = parser.parse_args(argv)
4407
4408 return parser, opts, args
4409
def gen_extractors():
	"""Build and return the list of all supported information extractors.

	Ordering is significant: for each URL the first extractor whose
	suitable() check accepts it is the one used, so the catch-all
	GenericIE must come last.
	"""
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	# Site-specific extractors; several of them (playlists, searches)
	# delegate individual videos to a shared base extractor instance.
	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]

	# Fallback: tried only when nothing above matched.
	ies.append(GenericIE())
	return ies
4446
4447 def _real_main():
4448 parser, opts, args = parseOpts()
4449
4450 # Open appropriate CookieJar
4451 if opts.cookiefile is None:
4452 jar = cookielib.CookieJar()
4453 else:
4454 try:
4455 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4456 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4457 jar.load()
4458 except (IOError, OSError), err:
4459 sys.exit(u'ERROR: unable to open cookie file')
4460
4461 # Dump user agent
4462 if opts.dump_user_agent:
4463 print std_headers['User-Agent']
4464 sys.exit(0)
4465
4466 # Batch file verification
4467 batchurls = []
4468 if opts.batchfile is not None:
4469 try:
4470 if opts.batchfile == '-':
4471 batchfd = sys.stdin
4472 else:
4473 batchfd = open(opts.batchfile, 'r')
4474 batchurls = batchfd.readlines()
4475 batchurls = [x.strip() for x in batchurls]
4476 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4477 except IOError:
4478 sys.exit(u'ERROR: batch file could not be read')
4479 all_urls = batchurls + args
4480
4481 # General configuration
4482 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4483 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4484 urllib2.install_opener(opener)
4485 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4486
4487 extractors = gen_extractors()
4488
4489 if opts.list_extractors:
4490 for ie in extractors:
4491 print(ie.IE_NAME)
4492 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4493 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4494 for mu in matchedUrls:
4495 print(u' ' + mu)
4496 sys.exit(0)
4497
4498 # Conflicting, missing and erroneous options
4499 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4500 parser.error(u'using .netrc conflicts with giving username/password')
4501 if opts.password is not None and opts.username is None:
4502 parser.error(u'account username missing')
4503 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4504 parser.error(u'using output template conflicts with using title, literal title or auto number')
4505 if opts.usetitle and opts.useliteral:
4506 parser.error(u'using title conflicts with using literal title')
4507 if opts.username is not None and opts.password is None:
4508 opts.password = getpass.getpass(u'Type account password and press return:')
4509 if opts.ratelimit is not None:
4510 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4511 if numeric_limit is None:
4512 parser.error(u'invalid rate limit specified')
4513 opts.ratelimit = numeric_limit
4514 if opts.retries is not None:
4515 try:
4516 opts.retries = long(opts.retries)
4517 except (TypeError, ValueError), err:
4518 parser.error(u'invalid retry count specified')
4519 try:
4520 opts.playliststart = int(opts.playliststart)
4521 if opts.playliststart <= 0:
4522 raise ValueError(u'Playlist start must be positive')
4523 except (TypeError, ValueError), err:
4524 parser.error(u'invalid playlist start number specified')
4525 try:
4526 opts.playlistend = int(opts.playlistend)
4527 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4528 raise ValueError(u'Playlist end must be greater than playlist start')
4529 except (TypeError, ValueError), err:
4530 parser.error(u'invalid playlist end number specified')
4531 if opts.extractaudio:
4532 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4533 parser.error(u'invalid audio format specified')
4534
4535 # File downloader
4536 fd = FileDownloader({
4537 'usenetrc': opts.usenetrc,
4538 'username': opts.username,
4539 'password': opts.password,
4540 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4541 'forceurl': opts.geturl,
4542 'forcetitle': opts.gettitle,
4543 'forcethumbnail': opts.getthumbnail,
4544 'forcedescription': opts.getdescription,
4545 'forcefilename': opts.getfilename,
4546 'forceformat': opts.getformat,
4547 'simulate': opts.simulate,
4548 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4549 'format': opts.format,
4550 'format_limit': opts.format_limit,
4551 'listformats': opts.listformats,
4552 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4553 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4554 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4555 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4556 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4557 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4558 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4559 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4560 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4561 or u'%(id)s.%(ext)s'),
4562 'ignoreerrors': opts.ignoreerrors,
4563 'ratelimit': opts.ratelimit,
4564 'nooverwrites': opts.nooverwrites,
4565 'retries': opts.retries,
4566 'continuedl': opts.continue_dl,
4567 'noprogress': opts.noprogress,
4568 'playliststart': opts.playliststart,
4569 'playlistend': opts.playlistend,
4570 'logtostderr': opts.outtmpl == '-',
4571 'consoletitle': opts.consoletitle,
4572 'nopart': opts.nopart,
4573 'updatetime': opts.updatetime,
4574 'writedescription': opts.writedescription,
4575 'writeinfojson': opts.writeinfojson,
4576 'matchtitle': opts.matchtitle,
4577 'rejecttitle': opts.rejecttitle,
4578 'max_downloads': opts.max_downloads,
4579 'prefer_free_formats': opts.prefer_free_formats,
4580 })
4581 for extractor in extractors:
4582 fd.add_info_extractor(extractor)
4583
4584 # PostProcessors
4585 if opts.extractaudio:
4586 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4587
4588 # Update version
4589 if opts.update_self:
4590 updateSelf(fd, sys.argv[0])
4591
4592 # Maybe do nothing
4593 if len(all_urls) < 1:
4594 if not opts.update_self:
4595 parser.error(u'you must provide at least one URL')
4596 else:
4597 sys.exit()
4598
4599 try:
4600 retcode = fd.download(all_urls)
4601 except MaxDownloadsReached:
4602 fd.to_screen(u'--max-download limit reached, aborting.')
4603 retcode = 101
4604
4605 # Dump cookie jar if requested
4606 if opts.cookiefile is not None:
4607 try:
4608 jar.save()
4609 except (IOError, OSError), err:
4610 sys.exit(u'ERROR: unable to save cookie jar')
4611
4612 sys.exit(retcode)
4613
def main():
	"""Program entry point.

	Runs _real_main() and translates the well-known failure modes into
	clean exit messages/codes instead of raw tracebacks.
	"""
	try:
		_real_main()
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		sys.exit(1)
4623
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
	main()
4626
4627 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: