]> jfr.im git - yt-dlp.git/blob - youtube-dl
Clean up superfluous whitespace
[yt-dlp.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Everyone who has contributed code to this script.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    'Filippo Valsorda',
    )

__license__ = 'Public Domain'
__version__ = '2012.02.27'

# Upstream location of the latest script; presumably consumed by a
# self-update routine defined later in the file -- TODO confirm (not
# visible in this chunk).
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53 import ctypes
54
55 try:
56 import email.utils
57 except ImportError: # Python 2.4
58 import email.Utils
59 try:
60 import cStringIO as StringIO
61 except ImportError:
62 import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66 from urlparse import parse_qs
67 except ImportError:
68 from cgi import parse_qs
69
70 try:
71 import lxml.etree
72 except ImportError:
73 pass # Handled below
74
75 try:
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
79
# Default HTTP headers merged into every outgoing request by
# YoutubeDLHandler.http_request() below (existing request headers with the
# same name are replaced).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
87
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module: decode only."""
        @staticmethod
        def loads(s):
            """Parse a UTF-8 byte string as JSON and return the value.

            Raises ValueError (with position information) on malformed input.
            """
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance i past JSON whitespace; optionally require that
                # some input remains afterwards.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (captured without the
                # leading backslash) to its character.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # \uXXXX\uXXXX surrogate pair -> single astral codepoint.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # Count the backslashes immediately before the quote; an
                    # odd count means the quote is escaped, keep scanning.
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Bare literals: true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # Any fractional or exponent part makes it a float.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of the value; anything else is
            # assumed to be a number.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
200
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Some platforms report an encoding the codecs machinery cannot
        # actually use; probe it before trusting it.
        u'TEST'.encode(pref)
    except:
        # Deliberately broad: any failure (locale error, unknown codec)
        # falls back to UTF-8, matching the original behavior.
        pref = 'UTF-8'
    return pref
216
217
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
220
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
223 """
224 entity = matchobj.group(1)
225
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
229
230 # Unicode character
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 if mobj is not None:
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
235 base = 16
236 numstr = u'0%s' % numstr
237 else:
238 base = 10
239 return unichr(long(numstr, base))
240
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
243
244
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
249
250
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # '-' means "write to standard output".
        if filename == u'-':
            if sys.platform == 'win32':
                # Put stdout into binary mode so byte output is not mangled.
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
276
277
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
285
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
289
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
292 res = []
293 for el in iterable:
294 if el not in res:
295 res.append(el)
296 return res
297
def _unescapeHTML(s):
    """Replace HTML entities in s with the characters they denote.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    return HTMLParser.HTMLParser().unescape(s)
306
307 def _encodeFilename(s):
308 """
309 @param s The name of the file (of type unicode)
310 """
311
312 assert type(s) == type(u'')
313
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
318 return s
319 else:
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
321
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
330
331
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
339
340
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
348
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
352
353
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
361
362
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
377
378
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first (negative wbits = no zlib header); some
        # servers send that despite advertising 'deflate'. Fall back to the
        # standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl carrying the response code; older Pythons'
        # addinfourl has no getcode()/code constructor support, so set the
        # attribute by hand there.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the std_headers values onto the request, replacing any
        # headers of the same name that were already set.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honor the internal no-compression marker header and strip it
        # before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
436
437
438 class FileDownloader(object):
439 """File Downloader class.
440
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
447
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
455
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
462
463 Available options:
464
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
496 """
497
498 params = None
499 _ies = []
500 _pps = []
501 _download_retcode = None
502 _num_downloads = None
503 _screen_file = None
504
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
507 self._ies = []
508 self._pps = []
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512 self.params = params
513
514 @staticmethod
515 def format_bytes(bytes):
516 if bytes is None:
517 return 'N/A'
518 if type(bytes) is str:
519 bytes = float(bytes)
520 if bytes == 0.0:
521 exponent = 0
522 else:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
527
528 @staticmethod
529 def calc_percent(byte_counter, data_len):
530 if data_len is None:
531 return '---.-%'
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533
534 @staticmethod
535 def calc_eta(start, now, total, current):
536 if total is None:
537 return '--:--'
538 dif = now - start
539 if current == 0 or dif < 0.001: # One millisecond
540 return '--:--'
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
544 if eta_mins > 99:
545 return '--:--'
546 return '%02d:%02d' % (eta_mins, eta_secs)
547
548 @staticmethod
549 def calc_speed(start, now, bytes):
550 dif = now - start
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554
555 @staticmethod
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
560 return long(new_max)
561 rate = bytes / elapsed_time
562 if rate > new_max:
563 return long(new_max)
564 if rate < new_min:
565 return long(new_min)
566 return long(rate)
567
568 @staticmethod
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572 if matchobj is None:
573 return None
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
577
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE also gets a reference back to us.
        ie.set_downloader(self)
582
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration: the PP also gets a reference back to us.
        pp.set_downloader(self)
587
    def to_screen(self, message, skip_eol=False):
        """Print message (a unicode string) to the screen file (stdout or
        stderr, see __init__) unless in quiet mode; skip_eol suppresses the
        trailing newline (used by the in-place progress display)."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            # Flush so progress updates appear immediately.
            self._screen_file.flush()
599
    def to_stderr(self, message):
        """Print message to stderr, encoded with the locale's preferred
        encoding."""
        print >>sys.stderr, message.encode(preferredencoding())
603
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence: ESC ] 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # When errors are ignored, remember the failure in the exit code.
        self._download_retcode = 1
631
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
636 return
637 now = time.time()
638 elapsed = now - start_time
639 if elapsed <= 0.0:
640 return
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649 return filename
650 return filename + u'.part'
651
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
655 return filename
656
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting (but not raising)
        failures through self.trouble()."""
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            # NOTE(review): err is intentionally not included in the message.
            self.trouble(u'ERROR: unable to rename file')
664
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
668 return
669 if not os.path.isfile(_encodeFilename(filename)):
670 return
671 timestr = last_modified_hdr
672 if timestr is None:
673 return
674 filetime = timeconvert(timestr)
675 if filetime is None:
676 return filetime
677 try:
678 os.utime(filename, (time.time(), filetime))
679 except:
680 pass
681 return filetime
682
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)
686
    def report_writesubtitles(self, srtfn):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)
698
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        # '\r' + skip_eol redraws the progress line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        # Mirror the progress into the terminal/console title bar.
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # File name not representable in the output encoding; report
            # without it.
            self.to_screen(u'[download] The file has already been downloaded')
722
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
726
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line was drawn with skip_eol; emit the newline
            # that terminates it.
            self.to_screen(u'')
733
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
737
    def prepare_filename(self, info_dict):
        """Generate the output filename from the 'outtmpl' template, or
        return None (after reporting) when the template cannot be filled."""
        try:
            template_dict = dict(info_dict)
            # Extra template fields beyond what the IE provided.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
749
    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded """

        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            # NOTE(review): this message already carries '[download] ' while
            # the caller prepends it again -- the prefix appears twice.
            return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        return None
761
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
764
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
768 return
769
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
774
775 filename = self.prepare_filename(info_dict)
776
777 # Forced printings
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
793 return
794
795 if filename is None:
796 return
797
798 try:
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
801 os.makedirs(dn)
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804 return
805
806 if self.params.get('writedescription', False):
807 try:
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
811 try:
812 descfile.write(info_dict['description'].encode('utf-8'))
813 finally:
814 descfile.close()
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817 return
818
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
822 try:
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
826 try:
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
828 finally:
829 srtfile.close()
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832 return
833
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
837 try:
838 json.dump
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841 return
842 try:
843 infof = open(_encodeFilename(infofn), 'wb')
844 try:
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
847 finally:
848 infof.close()
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851 return
852
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
855 success = True
856 else:
857 try:
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863 return
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
866 return
867
868 if success:
869 try:
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
873 return
874
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
879
880 for url in url_list:
881 suitable_found = False
882 for ie in self._ies:
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
885 continue
886
887 # Suitable InfoExtractor found
888 suitable_found = True
889
890 # Extract information from URL and process it
891 ie.extract(url)
892
893 # Suitable InfoExtractor had been found; go to next URL
894 break
895
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898
899 return self._download_retcode
900
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                # A postprocessor may abort the rest of the chain by
                # returning None.
                break
909
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to rtmpdump, resuming
        until it either finishes or stops making progress.

        Returns True on success, False on failure (after reporting).
        """
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                # pipes.quote gives proper shell quoting when available.
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        # Keep re-invoking rtmpdump in resume mode while it reports a
        # resumable interruption (2) or an error (1), as long as the file
        # keeps growing between attempts.
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                break
             # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
954
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
958
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
962 return True
963
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
967
968 tmpfilename = self.temp_name(filename)
969 stream = None
970
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
975
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
979 else:
980 resume_len = 0
981
982 open_mode = 'wb'
983 if resume_len != 0:
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
987 open_mode = 'ab'
988 else:
989 resume_len = 0
990
991 count = 0
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
995 try:
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
999 break
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1003 raise
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1006 try:
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1012 raise
1013 else:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1026 return True
1027 else:
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1030 open_mode = 'wb'
1031 break
1032 # Retry
1033 count += 1
1034 if count <= retries:
1035 self.report_retry(count, retries)
1036
1037 if count > retries:
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1039 return False
1040
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1046 block_size = 1024
1047 start = time.time()
1048 while True:
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1052 after = time.time()
1053 if len(data_block) == 0:
1054 break
1055 byte_counter += len(data_block)
1056
1057 # Open file just in time
1058 if stream is None:
1059 try:
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1066 return False
1067 try:
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071 return False
1072 block_size = self.best_block_size(after - before, len(data_block))
1073
1074 # Progress message
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078 else:
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1082
1083 # Apply rate limit
1084 self.slow_down(start, byte_counter - resume_len)
1085
1086 if stream is None:
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1088 return False
1089 stream.close()
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1094
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1098
1099 return True
1100
1101
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor produces one dictionary per video
	and hands it to the FileDownloader, which takes care of the actual
	download (and any postprocessing). Each dictionary must contain:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	Optional fields, used mainly when youtube-dl serves as the backend of a
	video search front-end (e.g. youtube2mp3) and the corresponding forced
	printing option is active:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses must define a _VALID_URL regexp and override
	_real_initialize() and _real_extract(); they should normally also be
	registered in the global extractor list.
	"""

	# Class-level defaults; __init__ re-sets them per instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor and attach the optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		match = re.match(self._VALID_URL, url)
		return match is not None

	def initialize(self):
		"""Run one-time setup (authentication, etc) on first use only."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the video info."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this extractor reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1170
1171
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
1174
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1183 _video_extensions = {
1184 '13': '3gp',
1185 '17': 'mp4',
1186 '18': 'mp4',
1187 '22': 'mp4',
1188 '37': 'mp4',
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1190 '43': 'webm',
1191 '44': 'webm',
1192 '45': 'webm',
1193 }
1194 _video_dimensions = {
1195 '5': '240x400',
1196 '6': '???',
1197 '13': '???',
1198 '17': '144x176',
1199 '18': '360x640',
1200 '22': '720x1280',
1201 '34': '360x640',
1202 '35': '480x854',
1203 '37': '1080x1920',
1204 '38': '3072x4096',
1205 '43': '360x640',
1206 '44': '480x854',
1207 '45': '720x1280',
1208 }
1209 IE_NAME = u'youtube'
1210
1211 def report_lang(self):
1212 """Report attempt to set language."""
1213 self._downloader.to_screen(u'[youtube] Setting language')
1214
1215 def report_login(self):
1216 """Report attempt to log in."""
1217 self._downloader.to_screen(u'[youtube] Logging in')
1218
1219 def report_age_confirmation(self):
1220 """Report attempt to confirm age."""
1221 self._downloader.to_screen(u'[youtube] Confirming age')
1222
1223 def report_video_webpage_download(self, video_id):
1224 """Report attempt to download video webpage."""
1225 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1226
1227 def report_video_info_webpage_download(self, video_id):
1228 """Report attempt to download video info webpage."""
1229 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1230
1231 def report_video_subtitles_download(self, video_id):
1232 """Report attempt to download video info webpage."""
1233 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1234
1235 def report_information_extraction(self, video_id):
1236 """Report attempt to extract video information."""
1237 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1238
1239 def report_unavailable_format(self, video_id, format):
1240 """Report extracted video URL."""
1241 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1242
1243 def report_rtmp_download(self):
1244 """Indicate the download will use the RTMP protocol."""
1245 self._downloader.to_screen(u'[youtube] RTMP download detected')
1246
1247 def _closed_captions_xml_to_srt(self, xml_string):
1248 srt = ''
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
1262 return srt
1263
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1266 for x in formats:
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1268
1269 def _real_initialize(self):
1270 if self._downloader is None:
1271 return
1272
1273 username = None
1274 password = None
1275 downloader_params = self._downloader.params
1276
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1282 try:
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1285 username = info[0]
1286 password = info[2]
1287 else:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289 except (IOError, netrc.NetrcParseError), err:
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1291 return
1292
1293 # Set language
1294 request = urllib2.Request(self._LANG_URL)
1295 try:
1296 self.report_lang()
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1300 return
1301
1302 # No authentication to be performed
1303 if username is None:
1304 return
1305
1306 # Log in
1307 login_form = {
1308 'current_form': 'loginForm',
1309 'next': '/',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1313 }
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1315 try:
1316 self.report_login()
1317 login_results = urllib2.urlopen(request).read()
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1320 return
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1323 return
1324
1325 # Confirm age
1326 age_form = {
1327 'next_url': '/',
1328 'action_confirm': 'Confirm',
1329 }
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1331 try:
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1336 return
1337
1338 def _real_extract(self, url):
1339 # Extract video id from URL
1340 mobj = re.match(self._VALID_URL, url)
1341 if mobj is None:
1342 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1343 return
1344 video_id = mobj.group(2)
1345
1346 # Get video webpage
1347 self.report_video_webpage_download(video_id)
1348 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1349 try:
1350 video_webpage = urllib2.urlopen(request).read()
1351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1353 return
1354
1355 # Attempt to extract SWF player URL
1356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357 if mobj is not None:
1358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1359 else:
1360 player_url = None
1361
1362 # Get video info
1363 self.report_video_info_webpage_download(video_id)
1364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366 % (video_id, el_type))
1367 request = urllib2.Request(video_info_url)
1368 try:
1369 video_info_webpage = urllib2.urlopen(request).read()
1370 video_info = parse_qs(video_info_webpage)
1371 if 'token' in video_info:
1372 break
1373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1375 return
1376 if 'token' not in video_info:
1377 if 'reason' in video_info:
1378 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1379 else:
1380 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1381 return
1382
1383 # Start extracting information
1384 self.report_information_extraction(video_id)
1385
1386 # uploader
1387 if 'author' not in video_info:
1388 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1389 return
1390 video_uploader = urllib.unquote_plus(video_info['author'][0])
1391
1392 # title
1393 if 'title' not in video_info:
1394 self._downloader.trouble(u'ERROR: unable to extract video title')
1395 return
1396 video_title = urllib.unquote_plus(video_info['title'][0])
1397 video_title = video_title.decode('utf-8')
1398 video_title = sanitize_title(video_title)
1399
1400 # simplified title
1401 simple_title = _simplify_title(video_title)
1402
1403 # thumbnail image
1404 if 'thumbnail_url' not in video_info:
1405 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406 video_thumbnail = ''
1407 else: # don't panic if we can't find it
1408 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1409
1410 # upload date
1411 upload_date = u'NA'
1412 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413 if mobj is not None:
1414 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416 for expression in format_expressions:
1417 try:
1418 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1419 except:
1420 pass
1421
1422 # description
1423 try:
1424 lxml.etree
1425 except NameError:
1426 video_description = u'No description available.'
1427 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428 if mobj is not None:
1429 video_description = mobj.group(1).decode('utf-8')
1430 else:
1431 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434 # TODO use another parser
1435
1436 # closed captions
1437 video_subtitles = None
1438 if self._downloader.params.get('writesubtitles', False):
1439 self.report_video_subtitles_download(video_id)
1440 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1441 try:
1442 srt_list = urllib2.urlopen(request).read()
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1445 else:
1446 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1447 if srt_lang_list:
1448 if self._downloader.params.get('subtitleslang', False):
1449 srt_lang = self._downloader.params.get('subtitleslang')
1450 elif 'en' in srt_lang_list:
1451 srt_lang = 'en'
1452 else:
1453 srt_lang = srt_lang_list[0]
1454 if not srt_lang in srt_lang_list:
1455 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1456 else:
1457 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1458 try:
1459 srt_xml = urllib2.urlopen(request).read()
1460 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1462 else:
1463 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1464 else:
1465 self._downloader.trouble(u'WARNING: video has no closed captions')
1466
1467 # token
1468 video_token = urllib.unquote_plus(video_info['token'][0])
1469
1470 # Decide which formats to download
1471 req_format = self._downloader.params.get('format', None)
1472
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 video_url_list = [(None, video_info['conn'][0])]
1476 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478 url_data = [parse_qs(uds) for uds in url_data_strs]
1479 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1481
1482 format_limit = self._downloader.params.get('format_limit', None)
1483 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484 if format_limit is not None and format_limit in available_formats:
1485 format_list = available_formats[available_formats.index(format_limit):]
1486 else:
1487 format_list = available_formats
1488 existing_formats = [x for x in format_list if x in url_map]
1489 if len(existing_formats) == 0:
1490 self._downloader.trouble(u'ERROR: no known formats available for video')
1491 return
1492 if self._downloader.params.get('listformats', None):
1493 self._print_formats(existing_formats)
1494 return
1495 if req_format is None or req_format == 'best':
1496 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497 elif req_format == 'worst':
1498 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499 elif req_format in ('-1', 'all'):
1500 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1501 else:
1502 # Specific formats. We pick the first in a slash-delimeted sequence.
1503 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504 req_formats = req_format.split('/')
1505 video_url_list = None
1506 for rf in req_formats:
1507 if rf in url_map:
1508 video_url_list = [(rf, url_map[rf])]
1509 break
1510 if video_url_list is None:
1511 self._downloader.trouble(u'ERROR: requested format not available')
1512 return
1513 else:
1514 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1515 return
1516
1517 for format_param, video_real_url in video_url_list:
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1520
1521 # Extension
1522 video_extension = self._video_extensions.get(format_param, 'flv')
1523
1524 try:
1525 # Process video information
1526 self._downloader.process_info({
1527 'id': video_id.decode('utf-8'),
1528 'url': video_real_url.decode('utf-8'),
1529 'uploader': video_uploader.decode('utf-8'),
1530 'upload_date': upload_date,
1531 'title': video_title,
1532 'stitle': simple_title,
1533 'ext': video_extension.decode('utf-8'),
1534 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1535 'thumbnail': video_thumbnail.decode('utf-8'),
1536 'description': video_description,
1537 'player_url': player_url,
1538 'subtitles': video_subtitles
1539 })
1540 except UnavailableVideoError, err:
1541 self._downloader.trouble(u'\nERROR: unable to download video')
1542
1543
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate 'yt-' prefixed videos (set in __init__).
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used to handle videos hosted on YouTube."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the family-filter opt-out."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			# Response body is unused; the request sets the session cookie.
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the video info dict for a Metacafe URL and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; 'yt-<id>' ids are YouTube
		# embeds and are delegated to the YouTube extractor.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Old-style page: direct mediaURL parameter.
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey (access token appended to the media URL) if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer-style page: media data lives in the flashvars parameter.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1684
1685
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Plain delegation to the InfoExtractor constructor."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the video info dict for a Dailymotion URL and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information.
		# The cookie disables the family filter so age-gated pages load.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		# 'sdURL' is the standard-definition stream inside the sequence data.
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1772
1773
1774 class GoogleIE(InfoExtractor):
1775 """Information extractor for video.google.com."""
1776
1777 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778 IE_NAME = u'video.google'
1779
1780 def __init__(self, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782
1783 def report_download_webpage(self, video_id):
1784 """Report webpage download."""
1785 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1786
1787 def report_extraction(self, video_id):
1788 """Report information extraction."""
1789 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1790
1791 def _real_extract(self, url):
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1794 if mobj is None:
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796 return
1797
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1801
1802 video_extension = 'mp4'
1803
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1806 try:
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811 return
1812
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
1815 mobj = re.search(r"download_url:'([^']+)'", webpage)
1816 if mobj is None:
1817 video_extension = 'flv'
1818 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1819 if mobj is None:
1820 self._downloader.trouble(u'ERROR: unable to extract media URL')
1821 return
1822 mediaURL = urllib.unquote(mobj.group(1))
1823 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824 mediaURL = mediaURL.replace('\\x26', '\x26')
1825
1826 video_url = mediaURL
1827
1828 mobj = re.search(r'<title>(.*)</title>', webpage)
1829 if mobj is None:
1830 self._downloader.trouble(u'ERROR: unable to extract title')
1831 return
1832 video_title = mobj.group(1).decode('utf-8')
1833 video_title = sanitize_title(video_title)
1834 simple_title = _simplify_title(video_title)
1835
1836 # Extract video description
1837 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: unable to extract video description')
1840 return
1841 video_description = mobj.group(1).decode('utf-8')
1842 if not video_description:
1843 video_description = 'No description available.'
1844
1845 # Extract video thumbnail
1846 if self._downloader.params.get('forcethumbnail', False):
1847 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1848 try:
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1852 return
1853 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1854 if mobj is None:
1855 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1856 return
1857 video_thumbnail = mobj.group(1)
1858 else: # we need something to pass to process_info
1859 video_thumbnail = ''
1860
1861 try:
1862 # Process video information
1863 self._downloader.process_info({
1864 'id': video_id.decode('utf-8'),
1865 'url': video_url.decode('utf-8'),
1866 'uploader': u'NA',
1867 'upload_date': u'NA',
1868 'title': video_title,
1869 'stitle': simple_title,
1870 'ext': video_extension.decode('utf-8'),
1871 'format': u'NA',
1872 'player_url': None,
1873 })
1874 except UnavailableVideoError:
1875 self._downloader.trouble(u'\nERROR: unable to download video')
1876
1877
1878 class PhotobucketIE(InfoExtractor):
1879 """Information extractor for photobucket.com."""
1880
1881 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882 IE_NAME = u'photobucket'
1883
1884 def __init__(self, downloader=None):
1885 InfoExtractor.__init__(self, downloader)
1886
1887 def report_download_webpage(self, video_id):
1888 """Report webpage download."""
1889 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1890
1891 def report_extraction(self, video_id):
1892 """Report information extraction."""
1893 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1894
1895 def _real_extract(self, url):
1896 # Extract id from URL
1897 mobj = re.match(self._VALID_URL, url)
1898 if mobj is None:
1899 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1900 return
1901
1902 # At this point we have a new video
1903 self._downloader.increment_downloads()
1904 video_id = mobj.group(1)
1905
1906 video_extension = 'flv'
1907
1908 # Retrieve video webpage to extract further information
1909 request = urllib2.Request(url)
1910 try:
1911 self.report_download_webpage(video_id)
1912 webpage = urllib2.urlopen(request).read()
1913 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1915 return
1916
1917 # Extract URL, uploader, and title from webpage
1918 self.report_extraction(video_id)
1919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1920 if mobj is None:
1921 self._downloader.trouble(u'ERROR: unable to extract media URL')
1922 return
1923 mediaURL = urllib.unquote(mobj.group(1))
1924
1925 video_url = mediaURL
1926
1927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract title')
1930 return
1931 video_title = mobj.group(1).decode('utf-8')
1932 video_title = sanitize_title(video_title)
1933 simple_title = _simplify_title(vide_title)
1934
1935 video_uploader = mobj.group(2).decode('utf-8')
1936
1937 try:
1938 # Process video information
1939 self._downloader.process_info({
1940 'id': video_id.decode('utf-8'),
1941 'url': video_url.decode('utf-8'),
1942 'uploader': video_uploader,
1943 'upload_date': u'NA',
1944 'title': video_title,
1945 'stitle': simple_title,
1946 'ext': video_extension.decode('utf-8'),
1947 'format': u'NA',
1948 'player_url': None,
1949 })
1950 except UnavailableVideoError:
1951 self._downloader.trouble(u'\nERROR: unable to download video')
1952
1953
1954 class YahooIE(InfoExtractor):
1955 """Information extractor for video.yahoo.com."""
1956
1957 # _VALID_URL matches all Yahoo! Video URLs
1958 # _VPAGE_URL matches only the extractable '/watch/' URLs
1959 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961 IE_NAME = u'video.yahoo'
1962
1963 def __init__(self, downloader=None):
1964 InfoExtractor.__init__(self, downloader)
1965
1966 def report_download_webpage(self, video_id):
1967 """Report webpage download."""
1968 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1969
1970 def report_extraction(self, video_id):
1971 """Report information extraction."""
1972 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1973
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1977 if mobj is None:
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979 return
1980
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(2)
1984 video_extension = 'flv'
1985
1986 # Rewrite valid but non-extractable URLs as
1987 # extractable English language /watch/ URLs
1988 if re.match(self._VPAGE_URL, url) is None:
1989 request = urllib2.Request(url)
1990 try:
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994 return
1995
1996 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1997 if mobj is None:
1998 self._downloader.trouble(u'ERROR: Unable to extract id field')
1999 return
2000 yahoo_id = mobj.group(1)
2001
2002 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2003 if mobj is None:
2004 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2005 return
2006 yahoo_vid = mobj.group(1)
2007
2008 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009 return self._real_extract(url, new_video=False)
2010
2011 # Retrieve video webpage to extract further information
2012 request = urllib2.Request(url)
2013 try:
2014 self.report_download_webpage(video_id)
2015 webpage = urllib2.urlopen(request).read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2018 return
2019
2020 # Extract uploader and title from webpage
2021 self.report_extraction(video_id)
2022 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2023 if mobj is None:
2024 self._downloader.trouble(u'ERROR: unable to extract video title')
2025 return
2026 video_title = mobj.group(1).decode('utf-8')
2027 simple_title = _simplify_title(video_title)
2028
2029 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2030 if mobj is None:
2031 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2032 return
2033 video_uploader = mobj.group(1).decode('utf-8')
2034
2035 # Extract video thumbnail
2036 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2037 if mobj is None:
2038 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2039 return
2040 video_thumbnail = mobj.group(1).decode('utf-8')
2041
2042 # Extract video description
2043 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2044 if mobj is None:
2045 self._downloader.trouble(u'ERROR: unable to extract video description')
2046 return
2047 video_description = mobj.group(1).decode('utf-8')
2048 if not video_description:
2049 video_description = 'No description available.'
2050
2051 # Extract video height and width
2052 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2053 if mobj is None:
2054 self._downloader.trouble(u'ERROR: unable to extract video height')
2055 return
2056 yv_video_height = mobj.group(1)
2057
2058 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2059 if mobj is None:
2060 self._downloader.trouble(u'ERROR: unable to extract video width')
2061 return
2062 yv_video_width = mobj.group(1)
2063
2064 # Retrieve video playlist to extract media URL
2065 # I'm not completely sure what all these options are, but we
2066 # seem to need most of them, otherwise the server sends a 401.
2067 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2068 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2069 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2072 try:
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2077 return
2078
2079 # Extract media URL from playlist XML
2080 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2081 if mobj is None:
2082 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2083 return
2084 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2085 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2086
2087 try:
2088 # Process video information
2089 self._downloader.process_info({
2090 'id': video_id.decode('utf-8'),
2091 'url': video_url,
2092 'uploader': video_uploader,
2093 'upload_date': u'NA',
2094 'title': video_title,
2095 'stitle': simple_title,
2096 'ext': video_extension.decode('utf-8'),
2097 'thumbnail': video_thumbnail.decode('utf-8'),
2098 'description': video_description,
2099 'thumbnail': video_thumbnail,
2100 'player_url': None,
2101 })
2102 except UnavailableVideoError:
2103 self._downloader.trouble(u'\nERROR: unable to download video')
2104
2105
2106 class VimeoIE(InfoExtractor):
2107 """Information extractor for vimeo.com."""
2108
2109 # _VALID_URL matches Vimeo URLs
2110 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2111 IE_NAME = u'vimeo'
2112
2113 def __init__(self, downloader=None):
2114 InfoExtractor.__init__(self, downloader)
2115
2116 def report_download_webpage(self, video_id):
2117 """Report webpage download."""
2118 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2119
2120 def report_extraction(self, video_id):
2121 """Report information extraction."""
2122 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2123
2124 def _real_extract(self, url, new_video=True):
2125 # Extract ID from URL
2126 mobj = re.match(self._VALID_URL, url)
2127 if mobj is None:
2128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2129 return
2130
2131 # At this point we have a new video
2132 self._downloader.increment_downloads()
2133 video_id = mobj.group(1)
2134
2135 # Retrieve video webpage to extract further information
2136 request = urllib2.Request(url, None, std_headers)
2137 try:
2138 self.report_download_webpage(video_id)
2139 webpage = urllib2.urlopen(request).read()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2142 return
2143
2144 # Now we begin extracting as much information as we can from what we
2145 # retrieved. First we extract the information common to all extractors,
2146 # and latter we extract those that are Vimeo specific.
2147 self.report_extraction(video_id)
2148
2149 # Extract the config JSON
2150 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2151 try:
2152 config = json.loads(config)
2153 except:
2154 self._downloader.trouble(u'ERROR: unable to extract info section')
2155 return
2156
2157 # Extract title
2158 video_title = config["video"]["title"]
2159 simple_title = _simplify_title(video_title)
2160
2161 # Extract uploader
2162 video_uploader = config["video"]["owner"]["name"]
2163
2164 # Extract video thumbnail
2165 video_thumbnail = config["video"]["thumbnail"]
2166
2167 # Extract video description
2168 try:
2169 lxml.etree
2170 except NameError:
2171 video_description = u'No description available.'
2172 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173 if mobj is not None:
2174 video_description = mobj.group(1)
2175 else:
2176 html_parser = lxml.etree.HTMLParser()
2177 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179 # TODO use another parser
2180
2181 # Extract upload date
2182 video_upload_date = u'NA'
2183 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184 if mobj is not None:
2185 video_upload_date = mobj.group(1)
2186
2187 # Vimeo specific: extract request signature and timestamp
2188 sig = config['request']['signature']
2189 timestamp = config['request']['timestamp']
2190
2191 # Vimeo specific: extract video codec and quality information
2192 # TODO bind to format param
2193 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194 for codec in codecs:
2195 if codec[0] in config["video"]["files"]:
2196 video_codec = codec[0]
2197 video_extension = codec[1]
2198 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199 else: quality = 'sd'
2200 break
2201 else:
2202 self._downloader.trouble(u'ERROR: no known codec found')
2203 return
2204
2205 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206 %(video_id, sig, timestamp, quality, video_codec.upper())
2207
2208 try:
2209 # Process video information
2210 self._downloader.process_info({
2211 'id': video_id,
2212 'url': video_url,
2213 'uploader': video_uploader,
2214 'upload_date': video_upload_date,
2215 'title': video_title,
2216 'stitle': simple_title,
2217 'ext': video_extension,
2218 'thumbnail': video_thumbnail,
2219 'description': video_description,
2220 'player_url': None,
2221 })
2222 except UnavailableVideoError:
2223 self._downloader.trouble(u'ERROR: unable to download video')
2224
2225
2226 class GenericIE(InfoExtractor):
2227 """Generic last-resort information extractor."""
2228
2229 _VALID_URL = r'.*'
2230 IE_NAME = u'generic'
2231
2232 def __init__(self, downloader=None):
2233 InfoExtractor.__init__(self, downloader)
2234
2235 def report_download_webpage(self, video_id):
2236 """Report webpage download."""
2237 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2238 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2239
2240 def report_extraction(self, video_id):
2241 """Report information extraction."""
2242 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2243
2244 def _real_extract(self, url):
2245 # At this point we have a new video
2246 self._downloader.increment_downloads()
2247
2248 video_id = url.split('/')[-1]
2249 request = urllib2.Request(url)
2250 try:
2251 self.report_download_webpage(video_id)
2252 webpage = urllib2.urlopen(request).read()
2253 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2255 return
2256 except ValueError, err:
2257 # since this is the last-resort InfoExtractor, if
2258 # this error is thrown, it'll be thrown here
2259 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2260 return
2261
2262 self.report_extraction(video_id)
2263 # Start with something easy: JW Player in SWFObject
2264 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2265 if mobj is None:
2266 # Broaden the search a little bit
2267 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2268 if mobj is None:
2269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2270 return
2271
2272 # It's possible that one of the regexes
2273 # matched, but returned an empty group:
2274 if mobj.group(1) is None:
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2276 return
2277
2278 video_url = urllib.unquote(mobj.group(1))
2279 video_id = os.path.basename(video_url)
2280
2281 # here's a fun little line of code for you:
2282 video_extension = os.path.splitext(video_id)[1][1:]
2283 video_id = os.path.splitext(video_id)[0]
2284
2285 # it's tempting to parse this further, but you would
2286 # have to take into account all the variations like
2287 # Video Title - Site Name
2288 # Site Name | Video Title
2289 # Video Title - Tagline | Site Name
2290 # and so on and so forth; it's just not practical
2291 mobj = re.search(r'<title>(.*)</title>', webpage)
2292 if mobj is None:
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2294 return
2295 video_title = mobj.group(1).decode('utf-8')
2296 video_title = sanitize_title(video_title)
2297 simple_title = _simplify_title(video_title)
2298
2299 # video uploader is domain name
2300 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2301 if mobj is None:
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2303 return
2304 video_uploader = mobj.group(1).decode('utf-8')
2305
2306 try:
2307 # Process video information
2308 self._downloader.process_info({
2309 'id': video_id.decode('utf-8'),
2310 'url': video_url.decode('utf-8'),
2311 'uploader': video_uploader,
2312 'upload_date': u'NA',
2313 'title': video_title,
2314 'stitle': simple_title,
2315 'ext': video_extension.decode('utf-8'),
2316 'format': u'NA',
2317 'player_url': None,
2318 })
2319 except UnavailableVideoError, err:
2320 self._downloader.trouble(u'\nERROR: unable to download video')
2321
2322
2323 class YoutubeSearchIE(InfoExtractor):
2324 """Information Extractor for YouTube search queries."""
2325 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2326 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2327 _youtube_ie = None
2328 _max_youtube_results = 1000
2329 IE_NAME = u'youtube:search'
2330
2331 def __init__(self, youtube_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._youtube_ie = youtube_ie
2334
2335 def report_download_page(self, query, pagenum):
2336 """Report attempt to download playlist page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2339
2340 def _real_initialize(self):
2341 self._youtube_ie.initialize()
2342
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2345 if mobj is None:
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2347 return
2348
2349 prefix, query = query.split(':')
2350 prefix = prefix[8:]
2351 query = query.encode('utf-8')
2352 if prefix == '':
2353 self._download_n_results(query, 1)
2354 return
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_youtube_results)
2357 return
2358 else:
2359 try:
2360 n = long(prefix)
2361 if n <= 0:
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363 return
2364 elif n > self._max_youtube_results:
2365 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2366 n = self._max_youtube_results
2367 self._download_n_results(query, n)
2368 return
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2371 return
2372
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2375
2376 video_ids = []
2377 pagenum = 0
2378 limit = n
2379
2380 while (50 * pagenum) < limit:
2381 self.report_download_page(query, pagenum+1)
2382 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2383 request = urllib2.Request(result_url)
2384 try:
2385 data = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2388 return
2389 api_response = json.loads(data)['data']
2390
2391 new_ids = list(video['id'] for video in api_response['items'])
2392 video_ids += new_ids
2393
2394 limit = min(n, api_response['totalItems'])
2395 pagenum += 1
2396
2397 if len(video_ids) > n:
2398 video_ids = video_ids[:n]
2399 for id in video_ids:
2400 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2401 return
2402
2403
2404 class GoogleSearchIE(InfoExtractor):
2405 """Information Extractor for Google Video search queries."""
2406 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2407 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2408 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2409 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2410 _google_ie = None
2411 _max_google_results = 1000
2412 IE_NAME = u'video.google:search'
2413
2414 def __init__(self, google_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._google_ie = google_ie
2417
2418 def report_download_page(self, query, pagenum):
2419 """Report attempt to download playlist page with given number."""
2420 query = query.decode(preferredencoding())
2421 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2422
2423 def _real_initialize(self):
2424 self._google_ie.initialize()
2425
2426 def _real_extract(self, query):
2427 mobj = re.match(self._VALID_URL, query)
2428 if mobj is None:
2429 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2430 return
2431
2432 prefix, query = query.split(':')
2433 prefix = prefix[8:]
2434 query = query.encode('utf-8')
2435 if prefix == '':
2436 self._download_n_results(query, 1)
2437 return
2438 elif prefix == 'all':
2439 self._download_n_results(query, self._max_google_results)
2440 return
2441 else:
2442 try:
2443 n = long(prefix)
2444 if n <= 0:
2445 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2446 return
2447 elif n > self._max_google_results:
2448 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2449 n = self._max_google_results
2450 self._download_n_results(query, n)
2451 return
2452 except ValueError: # parsing prefix as integer fails
2453 self._download_n_results(query, 1)
2454 return
2455
2456 def _download_n_results(self, query, n):
2457 """Downloads a specified number of results for a query"""
2458
2459 video_ids = []
2460 pagenum = 0
2461
2462 while True:
2463 self.report_download_page(query, pagenum)
2464 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2465 request = urllib2.Request(result_url)
2466 try:
2467 page = urllib2.urlopen(request).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2470 return
2471
2472 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 video_id = mobj.group(1)
2475 if video_id not in video_ids:
2476 video_ids.append(video_id)
2477 if len(video_ids) == n:
2478 # Specified n videos reached
2479 for id in video_ids:
2480 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2481 return
2482
2483 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2486 return
2487
2488 pagenum = pagenum + 1
2489
2490
2491 class YahooSearchIE(InfoExtractor):
2492 """Information Extractor for Yahoo! Video search queries."""
2493 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2494 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2495 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2496 _MORE_PAGES_INDICATOR = r'\s*Next'
2497 _yahoo_ie = None
2498 _max_yahoo_results = 1000
2499 IE_NAME = u'video.yahoo:search'
2500
2501 def __init__(self, yahoo_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._yahoo_ie = yahoo_ie
2504
2505 def report_download_page(self, query, pagenum):
2506 """Report attempt to download playlist page with given number."""
2507 query = query.decode(preferredencoding())
2508 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2509
2510 def _real_initialize(self):
2511 self._yahoo_ie.initialize()
2512
2513 def _real_extract(self, query):
2514 mobj = re.match(self._VALID_URL, query)
2515 if mobj is None:
2516 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2517 return
2518
2519 prefix, query = query.split(':')
2520 prefix = prefix[8:]
2521 query = query.encode('utf-8')
2522 if prefix == '':
2523 self._download_n_results(query, 1)
2524 return
2525 elif prefix == 'all':
2526 self._download_n_results(query, self._max_yahoo_results)
2527 return
2528 else:
2529 try:
2530 n = long(prefix)
2531 if n <= 0:
2532 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2533 return
2534 elif n > self._max_yahoo_results:
2535 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2536 n = self._max_yahoo_results
2537 self._download_n_results(query, n)
2538 return
2539 except ValueError: # parsing prefix as integer fails
2540 self._download_n_results(query, 1)
2541 return
2542
2543 def _download_n_results(self, query, n):
2544 """Downloads a specified number of results for a query"""
2545
2546 video_ids = []
2547 already_seen = set()
2548 pagenum = 1
2549
2550 while True:
2551 self.report_download_page(query, pagenum)
2552 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2553 request = urllib2.Request(result_url)
2554 try:
2555 page = urllib2.urlopen(request).read()
2556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2557 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2558 return
2559
2560 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 video_id = mobj.group(1)
2563 if video_id not in already_seen:
2564 video_ids.append(video_id)
2565 already_seen.add(video_id)
2566 if len(video_ids) == n:
2567 # Specified n videos reached
2568 for id in video_ids:
2569 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2570 return
2571
2572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2575 return
2576
2577 pagenum = pagenum + 1
2578
2579
2580 class YoutubePlaylistIE(InfoExtractor):
2581 """Information Extractor for YouTube playlists."""
2582
2583 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2584 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2585 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2586 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2587 _youtube_ie = None
2588 IE_NAME = u'youtube:playlist'
2589
2590 def __init__(self, youtube_ie, downloader=None):
2591 InfoExtractor.__init__(self, downloader)
2592 self._youtube_ie = youtube_ie
2593
2594 def report_download_page(self, playlist_id, pagenum):
2595 """Report attempt to download playlist page with given number."""
2596 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2597
2598 def _real_initialize(self):
2599 self._youtube_ie.initialize()
2600
2601 def _real_extract(self, url):
2602 # Extract playlist id
2603 mobj = re.match(self._VALID_URL, url)
2604 if mobj is None:
2605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2606 return
2607
2608 # Single video case
2609 if mobj.group(3) is not None:
2610 self._youtube_ie.extract(mobj.group(3))
2611 return
2612
2613 # Download playlist pages
2614 # prefix is 'p' as default for playlists but there are other types that need extra care
2615 playlist_prefix = mobj.group(1)
2616 if playlist_prefix == 'a':
2617 playlist_access = 'artist'
2618 else:
2619 playlist_prefix = 'p'
2620 playlist_access = 'view_play_list'
2621 playlist_id = mobj.group(2)
2622 video_ids = []
2623 pagenum = 1
2624
2625 while True:
2626 self.report_download_page(playlist_id, pagenum)
2627 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2628 request = urllib2.Request(url)
2629 try:
2630 page = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2633 return
2634
2635 # Extract video identifiers
2636 ids_in_page = []
2637 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2638 if mobj.group(1) not in ids_in_page:
2639 ids_in_page.append(mobj.group(1))
2640 video_ids.extend(ids_in_page)
2641
2642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2643 break
2644 pagenum = pagenum + 1
2645
2646 playliststart = self._downloader.params.get('playliststart', 1) - 1
2647 playlistend = self._downloader.params.get('playlistend', -1)
2648 if playlistend == -1:
2649 video_ids = video_ids[playliststart:]
2650 else:
2651 video_ids = video_ids[playliststart:playlistend]
2652
2653 for id in video_ids:
2654 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2655 return
2656
2657
2658 class YoutubeUserIE(InfoExtractor):
2659 """Information Extractor for YouTube users."""
2660
2661 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2662 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2663 _GDATA_PAGE_SIZE = 50
2664 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2665 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2666 _youtube_ie = None
2667 IE_NAME = u'youtube:user'
2668
2669 def __init__(self, youtube_ie, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2671 self._youtube_ie = youtube_ie
2672
2673 def report_download_page(self, username, start_index):
2674 """Report attempt to download user page."""
2675 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2676 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2677
2678 def _real_initialize(self):
2679 self._youtube_ie.initialize()
2680
2681 def _real_extract(self, url):
2682 # Extract username
2683 mobj = re.match(self._VALID_URL, url)
2684 if mobj is None:
2685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2686 return
2687
2688 username = mobj.group(1)
2689
2690 # Download video ids using YouTube Data API. Result size per
2691 # query is limited (currently to 50 videos) so we need to query
2692 # page by page until there are no video ids - it means we got
2693 # all of them.
2694
2695 video_ids = []
2696 pagenum = 0
2697
2698 while True:
2699 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2700 self.report_download_page(username, start_index)
2701
2702 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2703
2704 try:
2705 page = urllib2.urlopen(request).read()
2706 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2707 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2708 return
2709
2710 # Extract video identifiers
2711 ids_in_page = []
2712
2713 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2714 if mobj.group(1) not in ids_in_page:
2715 ids_in_page.append(mobj.group(1))
2716
2717 video_ids.extend(ids_in_page)
2718
2719 # A little optimization - if current page is not
2720 # "full", ie. does not contain PAGE_SIZE video ids then
2721 # we can assume that this page is the last one - there
2722 # are no more ids on further pages - no need to query
2723 # again.
2724
2725 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2726 break
2727
2728 pagenum += 1
2729
2730 all_ids_count = len(video_ids)
2731 playliststart = self._downloader.params.get('playliststart', 1) - 1
2732 playlistend = self._downloader.params.get('playlistend', -1)
2733
2734 if playlistend == -1:
2735 video_ids = video_ids[playliststart:]
2736 else:
2737 video_ids = video_ids[playliststart:playlistend]
2738
2739 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2740 (username, all_ids_count, len(video_ids)))
2741
2742 for video_id in video_ids:
2743 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2744
2745
2746 class DepositFilesIE(InfoExtractor):
2747 """Information extractor for depositfiles.com"""
2748
2749 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2750 IE_NAME = u'DepositFiles'
2751
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
2754
2755 def report_download_webpage(self, file_id):
2756 """Report webpage download."""
2757 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2758
2759 def report_extraction(self, file_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2762
2763 def _real_extract(self, url):
2764 # At this point we have a new file
2765 self._downloader.increment_downloads()
2766
2767 file_id = url.split('/')[-1]
2768 # Rebuild url in english locale
2769 url = 'http://depositfiles.com/en/files/' + file_id
2770
2771 # Retrieve file webpage with 'Free download' button pressed
2772 free_download_indication = { 'gateway_result' : '1' }
2773 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2774 try:
2775 self.report_download_webpage(file_id)
2776 webpage = urllib2.urlopen(request).read()
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2779 return
2780
2781 # Search for the real file URL
2782 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2783 if (mobj is None) or (mobj.group(1) is None):
2784 # Try to figure out reason of the error.
2785 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2786 if (mobj is not None) and (mobj.group(1) is not None):
2787 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2788 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2789 else:
2790 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2791 return
2792
2793 file_url = mobj.group(1)
2794 file_extension = os.path.splitext(file_url)[1][1:]
2795
2796 # Search for file title
2797 mobj = re.search(r'<b title="(.*?)">', webpage)
2798 if mobj is None:
2799 self._downloader.trouble(u'ERROR: unable to extract title')
2800 return
2801 file_title = mobj.group(1).decode('utf-8')
2802
2803 try:
2804 # Process file information
2805 self._downloader.process_info({
2806 'id': file_id.decode('utf-8'),
2807 'url': file_url.decode('utf-8'),
2808 'uploader': u'NA',
2809 'upload_date': u'NA',
2810 'title': file_title,
2811 'stitle': file_title,
2812 'ext': file_extension.decode('utf-8'),
2813 'format': u'NA',
2814 'player_url': None,
2815 })
2816 except UnavailableVideoError, err:
2817 self._downloader.trouble(u'ERROR: unable to download file')
2818
2819
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best quality first: the format selection in _real_extract
	# treats index 0 as "best" and the last entry as "worst".
	_available_formats = ['video', 'highqual', 'lowqual']
	_video_extensions = {
		'video': 'mp4',
		'highqual': 'mp4',
		'lowqual': 'mp4',
	}
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# General data
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
			}
		video_info = {}
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				# Fields missing from the page are simply absent from
				# video_info; callers must check each key themselves.
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Video urls
		video_urls = {}
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

		return video_info

	def _real_initialize(self):
		"""Log in to Facebook with --username/--password or .netrc credentials.

		Failures are reported as warnings only; extraction proceeds
		anonymously when no credentials are available or login fails.
		"""
		if self._downloader is None:
			return

		useremail = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					useremail = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# No credentials: continue without logging in.
		if useremail is None:
			return

		# Log in
		login_form = {
			'email': useremail,
			'pass': password,
			'login': 'Log+In'
			}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# A login <form> in the response means we are still on the
			# login page, i.e. authentication failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the video page, parse it and hand each chosen format to process_info."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group('ID')

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		try:
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# thumbnail image (non-fatal when missing)
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:
			video_thumbnail = video_info['thumbnail']

		# upload date
		upload_date = u'NA'
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				try:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
				except:
					# Deliberately best-effort: a malformed date leaves
					# upload_date as u'NA'.
					pass

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		# NOTE(review): if url_map is empty, video_url_list is never bound
		# and the for-loop below raises NameError - confirm before relying
		# on this path.
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:

			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'mp4')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description.decode('utf-8'),
					'player_url': None,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
3035
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Pulls the extension off a media URL, e.g. ".../file.flv" -> "flv".
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL turned out to be the media file itself."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Fetch blip.tv JSON metadata for the URL, or detect a direct media link."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for JSON metadata for the same page.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL points straight at the media; derive id/title/ext
				# from the filename and reuse the open handle for download.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# NOTE(review): 'json' is not among this chunk's visible
				# imports - presumably bound elsewhere in the file (e.g. a
				# simplejson fallback); confirm before refactoring.
				json_data = json.loads(json_code)
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# datestamp arrives as '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3128
3129
3130 class MyVideoIE(InfoExtractor):
3131 """Information Extractor for myvideo.de."""
3132
3133 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3134 IE_NAME = u'myvideo'
3135
3136 def __init__(self, downloader=None):
3137 InfoExtractor.__init__(self, downloader)
3138
3139 def report_download_webpage(self, video_id):
3140 """Report webpage download."""
3141 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3142
3143 def report_extraction(self, video_id):
3144 """Report information extraction."""
3145 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3146
3147 def _real_extract(self,url):
3148 mobj = re.match(self._VALID_URL, url)
3149 if mobj is None:
3150 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3151 return
3152
3153 video_id = mobj.group(1)
3154
3155 # Get video webpage
3156 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3157 try:
3158 self.report_download_webpage(video_id)
3159 webpage = urllib2.urlopen(request).read()
3160 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3161 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3162 return
3163
3164 self.report_extraction(video_id)
3165 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3166 webpage)
3167 if mobj is None:
3168 self._downloader.trouble(u'ERROR: unable to extract media URL')
3169 return
3170 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3171
3172 mobj = re.search('<title>([^<]+)</title>', webpage)
3173 if mobj is None:
3174 self._downloader.trouble(u'ERROR: unable to extract title')
3175 return
3176
3177 video_title = mobj.group(1)
3178 video_title = sanitize_title(video_title)
3179
3180 simple_title = _simplify_title(video_title)
3181
3182 try:
3183 self._downloader.process_info({
3184 'id': video_id,
3185 'url': video_url,
3186 'uploader': u'NA',
3187 'upload_date': u'NA',
3188 'title': video_title,
3189 'stitle': simple_title,
3190 'ext': u'flv',
3191 'format': u'NA',
3192 'player_url': None,
3193 })
3194 except UnavailableVideoError:
3195 self._downloader.trouble(u'\nERROR: Unable to download video')
3196
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts ':tds' / ':colbert' style shortcuts as well as full
	# /full-episodes/ URLs on either show's site.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report the start of information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of a clip's media configuration."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's clip index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve shortcuts/redirects, then download every clip of the episode."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map the ':tds' / ':colbert' shortcuts to the shows' episode index.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode in the URL: the index page redirects to the
		# newest episode, which we pick up after the request below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Re-parse the URL the server redirected us to.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# The player URL may itself redirect; resolve it once up front.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		# Each <item> in the MRSS index is one clip of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
					urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# (NB: 'format' shadows the builtin here; left as-is.)
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3331
3332
3333 class EscapistIE(InfoExtractor):
3334 """Information extractor for The Escapist """
3335
3336 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3337 IE_NAME = u'escapist'
3338
3339 def report_extraction(self, showName):
3340 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3341
3342 def report_config_download(self, showName):
3343 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3344
3345 def _real_extract(self, url):
3346 htmlParser = HTMLParser.HTMLParser()
3347
3348 mobj = re.match(self._VALID_URL, url)
3349 if mobj is None:
3350 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3351 return
3352 showName = mobj.group('showname')
3353 videoId = mobj.group('episode')
3354
3355 self.report_extraction(showName)
3356 try:
3357 webPage = urllib2.urlopen(url).read()
3358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3359 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3360 return
3361
3362 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3363 description = htmlParser.unescape(descMatch.group(1))
3364 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3365 imgUrl = htmlParser.unescape(imgMatch.group(1))
3366 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3367 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3368 configUrlMatch = re.search('config=(.*)$', playerUrl)
3369 configUrl = urllib2.unquote(configUrlMatch.group(1))
3370
3371 self.report_config_download(showName)
3372 try:
3373 configJSON = urllib2.urlopen(configUrl).read()
3374 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3375 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3376 return
3377
3378 # Technically, it's JavaScript, not JSON
3379 configJSON = configJSON.replace("'", '"')
3380
3381 try:
3382 config = json.loads(configJSON)
3383 except (ValueError,), err:
3384 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3385 return
3386
3387 playlist = config['playlist']
3388 videoUrl = playlist[1]['url']
3389
3390 self._downloader.increment_downloads()
3391 info = {
3392 'id': videoId,
3393 'url': videoUrl,
3394 'uploader': showName,
3395 'upload_date': None,
3396 'title': showName,
3397 'stitle': _simplify_title(showName),
3398 'ext': 'flv',
3399 'format': 'flv',
3400 'thumbnail': imgUrl,
3401 'description': description,
3402 'player_url': playerUrl,
3403 }
3404
3405 try:
3406 self._downloader.process_info(info)
3407 except UnavailableVideoError, err:
3408 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3409
3410
3411 class CollegeHumorIE(InfoExtractor):
3412 """Information extractor for collegehumor.com"""
3413
3414 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3415 IE_NAME = u'collegehumor'
3416
3417 def report_webpage(self, video_id):
3418 """Report information extraction."""
3419 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3420
3421 def report_extraction(self, video_id):
3422 """Report information extraction."""
3423 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3424
3425 def _real_extract(self, url):
3426 htmlParser = HTMLParser.HTMLParser()
3427
3428 mobj = re.match(self._VALID_URL, url)
3429 if mobj is None:
3430 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3431 return
3432 video_id = mobj.group('videoid')
3433
3434 self.report_webpage(video_id)
3435 request = urllib2.Request(url)
3436 try:
3437 webpage = urllib2.urlopen(request).read()
3438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3439 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3440 return
3441
3442 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3443 if m is None:
3444 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3445 return
3446 internal_video_id = m.group('internalvideoid')
3447
3448 info = {
3449 'id': video_id,
3450 'internal_id': internal_video_id,
3451 }
3452
3453 self.report_extraction(video_id)
3454 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3455 try:
3456 metaXml = urllib2.urlopen(xmlUrl).read()
3457 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3458 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3459 return
3460
3461 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3462 try:
3463 videoNode = mdoc.findall('./video')[0]
3464 info['description'] = videoNode.findall('./description')[0].text
3465 info['title'] = videoNode.findall('./caption')[0].text
3466 info['stitle'] = _simplify_title(info['title'])
3467 info['url'] = videoNode.findall('./file')[0].text
3468 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3469 info['ext'] = info['url'].rpartition('.')[2]
3470 info['format'] = info['ext']
3471 except IndexError:
3472 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3473 return
3474
3475 self._downloader.increment_downloads()
3476
3477 try:
3478 self._downloader.process_info(info)
3479 except UnavailableVideoError, err:
3480 self._downloader.trouble(u'\nERROR: unable to download video')
3481
3482
3483 class XVideosIE(InfoExtractor):
3484 """Information extractor for xvideos.com"""
3485
3486 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3487 IE_NAME = u'xvideos'
3488
3489 def report_webpage(self, video_id):
3490 """Report information extraction."""
3491 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3492
3493 def report_extraction(self, video_id):
3494 """Report information extraction."""
3495 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3496
3497 def _real_extract(self, url):
3498 htmlParser = HTMLParser.HTMLParser()
3499
3500 mobj = re.match(self._VALID_URL, url)
3501 if mobj is None:
3502 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3503 return
3504 video_id = mobj.group(1).decode('utf-8')
3505
3506 self.report_webpage(video_id)
3507
3508 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3509 try:
3510 webpage = urllib2.urlopen(request).read()
3511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3512 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3513 return
3514
3515 self.report_extraction(video_id)
3516
3517
3518 # Extract video URL
3519 mobj = re.search(r'flv_url=(.+?)&', webpage)
3520 if mobj is None:
3521 self._downloader.trouble(u'ERROR: unable to extract video url')
3522 return
3523 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3524
3525
3526 # Extract title
3527 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3528 if mobj is None:
3529 self._downloader.trouble(u'ERROR: unable to extract video title')
3530 return
3531 video_title = mobj.group(1).decode('utf-8')
3532
3533
3534 # Extract video thumbnail
3535 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3536 if mobj is None:
3537 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3538 return
3539 video_thumbnail = mobj.group(1).decode('utf-8')
3540
3541
3542
3543 self._downloader.increment_downloads()
3544 info = {
3545 'id': video_id,
3546 'url': video_url,
3547 'uploader': None,
3548 'upload_date': None,
3549 'title': video_title,
3550 'stitle': _simplify_title(video_title),
3551 'ext': 'flv',
3552 'format': 'flv',
3553 'thumbnail': video_thumbnail,
3554 'description': None,
3555 'player_url': None,
3556 }
3557
3558 try:
3559 self._downloader.process_info(info)
3560 except UnavailableVideoError, err:
3561 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3562
3563
3564 class SoundcloudIE(InfoExtractor):
3565 """Information extractor for soundcloud.com
3566 To access the media, the uid of the song and a stream token
3567 must be extracted from the page source and the script must make
3568 a request to media.soundcloud.com/crossdomain.xml. Then
3569 the media can be grabbed by requesting from an url composed
3570 of the stream token and uid
3571 """
3572
3573 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3574 IE_NAME = u'soundcloud'
3575
3576 def __init__(self, downloader=None):
3577 InfoExtractor.__init__(self, downloader)
3578
3579 def report_webpage(self, video_id):
3580 """Report information extraction."""
3581 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3582
3583 def report_extraction(self, video_id):
3584 """Report information extraction."""
3585 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3586
3587 def _real_extract(self, url):
3588 htmlParser = HTMLParser.HTMLParser()
3589
3590 mobj = re.match(self._VALID_URL, url)
3591 if mobj is None:
3592 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3593 return
3594
3595 # extract uploader (which is in the url)
3596 uploader = mobj.group(1).decode('utf-8')
3597 # extract simple title (uploader + slug of song title)
3598 slug_title = mobj.group(2).decode('utf-8')
3599 simple_title = uploader + '-' + slug_title
3600
3601 self.report_webpage('%s/%s' % (uploader, slug_title))
3602
3603 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3604 try:
3605 webpage = urllib2.urlopen(request).read()
3606 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3607 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3608 return
3609
3610 self.report_extraction('%s/%s' % (uploader, slug_title))
3611
3612 # extract uid and stream token that soundcloud hands out for access
3613 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3614 if mobj:
3615 video_id = mobj.group(1)
3616 stream_token = mobj.group(2)
3617
3618 # extract unsimplified title
3619 mobj = re.search('"title":"(.*?)",', webpage)
3620 if mobj:
3621 title = mobj.group(1)
3622
3623 # construct media url (with uid/token)
3624 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3625 mediaURL = mediaURL % (video_id, stream_token)
3626
3627 # description
3628 description = u'No description available'
3629 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3630 if mobj:
3631 description = mobj.group(1)
3632
3633 # upload date
3634 upload_date = None
3635 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3636 if mobj:
3637 try:
3638 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3639 except Exception, e:
3640 print str(e)
3641
3642 # for soundcloud, a request to a cross domain is required for cookies
3643 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3644
3645 try:
3646 self._downloader.process_info({
3647 'id': video_id.decode('utf-8'),
3648 'url': mediaURL,
3649 'uploader': uploader.decode('utf-8'),
3650 'upload_date': upload_date,
3651 'title': simple_title.decode('utf-8'),
3652 'stitle': simple_title.decode('utf-8'),
3653 'ext': u'mp3',
3654 'format': u'NA',
3655 'player_url': None,
3656 'description': description.decode('utf-8')
3657 })
3658 except UnavailableVideoError:
3659 self._downloader.trouble(u'\nERROR: unable to download video')
3660
3661
3662 class InfoQIE(InfoExtractor):
3663 """Information extractor for infoq.com"""
3664
3665 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3666 IE_NAME = u'infoq'
3667
3668 def report_webpage(self, video_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3671
3672 def report_extraction(self, video_id):
3673 """Report information extraction."""
3674 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3675
3676 def _real_extract(self, url):
3677 htmlParser = HTMLParser.HTMLParser()
3678
3679 mobj = re.match(self._VALID_URL, url)
3680 if mobj is None:
3681 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3682 return
3683
3684 self.report_webpage(url)
3685
3686 request = urllib2.Request(url)
3687 try:
3688 webpage = urllib2.urlopen(request).read()
3689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3690 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3691 return
3692
3693 self.report_extraction(url)
3694
3695
3696 # Extract video URL
3697 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3698 if mobj is None:
3699 self._downloader.trouble(u'ERROR: unable to extract video url')
3700 return
3701 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3702
3703
3704 # Extract title
3705 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3706 if mobj is None:
3707 self._downloader.trouble(u'ERROR: unable to extract video title')
3708 return
3709 video_title = mobj.group(1).decode('utf-8')
3710
3711 # Extract description
3712 video_description = u'No description available.'
3713 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3714 if mobj is not None:
3715 video_description = mobj.group(1).decode('utf-8')
3716
3717 video_filename = video_url.split('/')[-1]
3718 video_id, extension = video_filename.split('.')
3719
3720 self._downloader.increment_downloads()
3721 info = {
3722 'id': video_id,
3723 'url': video_url,
3724 'uploader': None,
3725 'upload_date': None,
3726 'title': video_title,
3727 'stitle': _simplify_title(video_title),
3728 'ext': extension,
3729 'format': extension, # Extension is always(?) mp4, but seems to be flv
3730 'thumbnail': None,
3731 'description': video_description,
3732 'player_url': None,
3733 }
3734
3735 try:
3736 self._downloader.process_info(info)
3737 except UnavailableVideoError, err:
3738 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3739
3740 class MixcloudIE(InfoExtractor):
3741 """Information extractor for www.mixcloud.com"""
3742 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3743 IE_NAME = u'mixcloud'
3744
3745 def __init__(self, downloader=None):
3746 InfoExtractor.__init__(self, downloader)
3747
3748 def report_download_json(self, file_id):
3749 """Report JSON download."""
3750 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3751
3752 def report_extraction(self, file_id):
3753 """Report information extraction."""
3754 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3755
3756 def get_urls(self, jsonData, fmt, bitrate='best'):
3757 """Get urls from 'audio_formats' section in json"""
3758 file_url = None
3759 try:
3760 bitrate_list = jsonData[fmt]
3761 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3762 bitrate = max(bitrate_list) # select highest
3763
3764 url_list = jsonData[fmt][bitrate]
3765 except TypeError: # we have no bitrate info.
3766 url_list = jsonData[fmt]
3767 return url_list
3768
3769 def check_urls(self, url_list):
3770 """Returns 1st active url from list"""
3771 for url in url_list:
3772 try:
3773 urllib2.urlopen(url)
3774 return url
3775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3776 url = None
3777
3778 return None
3779
3780 def _print_formats(self, formats):
3781 print 'Available formats:'
3782 for fmt in formats.keys():
3783 for b in formats[fmt]:
3784 try:
3785 ext = formats[fmt][b][0]
3786 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3787 except TypeError: # we have no bitrate info
3788 ext = formats[fmt][0]
3789 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3790 break
3791
3792 def _real_extract(self, url):
3793 mobj = re.match(self._VALID_URL, url)
3794 if mobj is None:
3795 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3796 return
3797 # extract uploader & filename from url
3798 uploader = mobj.group(1).decode('utf-8')
3799 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3800
3801 # construct API request
3802 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3803 # retrieve .json file with links to files
3804 request = urllib2.Request(file_url)
3805 try:
3806 self.report_download_json(file_url)
3807 jsonData = urllib2.urlopen(request).read()
3808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3809 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3810 return
3811
3812 # parse JSON
3813 json_data = json.loads(jsonData)
3814 player_url = json_data['player_swf_url']
3815 formats = dict(json_data['audio_formats'])
3816
3817 req_format = self._downloader.params.get('format', None)
3818 bitrate = None
3819
3820 if self._downloader.params.get('listformats', None):
3821 self._print_formats(formats)
3822 return
3823
3824 if req_format is None or req_format == 'best':
3825 for format_param in formats.keys():
3826 url_list = self.get_urls(formats, format_param)
3827 # check urls
3828 file_url = self.check_urls(url_list)
3829 if file_url is not None:
3830 break # got it!
3831 else:
3832 if req_format not in formats.keys():
3833 self._downloader.trouble(u'ERROR: format is not available')
3834 return
3835
3836 url_list = self.get_urls(formats, req_format)
3837 file_url = self.check_urls(url_list)
3838 format_param = req_format
3839
3840 # We have audio
3841 self._downloader.increment_downloads()
3842 try:
3843 # Process file information
3844 self._downloader.process_info({
3845 'id': file_id.decode('utf-8'),
3846 'url': file_url.decode('utf-8'),
3847 'uploader': uploader.decode('utf-8'),
3848 'upload_date': u'NA',
3849 'title': json_data['name'],
3850 'stitle': _simplify_title(json_data['name']),
3851 'ext': file_url.split('.')[-1].decode('utf-8'),
3852 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3853 'thumbnail': json_data['thumbnail_url'],
3854 'description': json_data['description'],
3855 'player_url': player_url.decode('utf-8'),
3856 })
3857 except UnavailableVideoError, err:
3858 self._downloader.trouble(u'ERROR: unable to download file')
3859
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific VideoPage (course+video groups),
    # a CoursePage (course group only), and the site root / HomePage.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': _simplify_title(course + '_' + video),
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # Missing <title> or <videoFile> element in the metadata.
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            info = {
                'id': _simplify_title(course),
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title is best-effort; fall back to the simplified id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage is re-dispatched through self.extract(),
            # presumably landing back in this method's single-video branch.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        else: # Root page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Recurse into every course page linked from the home page.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
3978
3979 class MTVIE(InfoExtractor):
3980 """Information extractor for MTV.com"""
3981
3982 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3983 IE_NAME = u'mtv'
3984
3985 def report_webpage(self, video_id):
3986 """Report information extraction."""
3987 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3988
3989 def report_extraction(self, video_id):
3990 """Report information extraction."""
3991 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3992
3993 def _real_extract(self, url):
3994 mobj = re.match(self._VALID_URL, url)
3995 if mobj is None:
3996 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3997 return
3998 if not mobj.group('proto'):
3999 url = 'http://' + url
4000 video_id = mobj.group('videoid')
4001 self.report_webpage(video_id)
4002
4003 request = urllib2.Request(url)
4004 try:
4005 webpage = urllib2.urlopen(request).read()
4006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4007 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4008 return
4009
4010 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4011 if mobj is None:
4012 self._downloader.trouble(u'ERROR: unable to extract song name')
4013 return
4014 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4015 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4016 if mobj is None:
4017 self._downloader.trouble(u'ERROR: unable to extract performer')
4018 return
4019 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4020 video_title = performer + ' - ' + song_name
4021
4022 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4023 if mobj is None:
4024 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4025 return
4026 mtvn_uri = mobj.group(1)
4027
4028 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4029 if mobj is None:
4030 self._downloader.trouble(u'ERROR: unable to extract content id')
4031 return
4032 content_id = mobj.group(1)
4033
4034 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4035 self.report_extraction(video_id)
4036 request = urllib2.Request(videogen_url)
4037 try:
4038 metadataXml = urllib2.urlopen(request).read()
4039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4040 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4041 return
4042
4043 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4044 renditions = mdoc.findall('.//rendition')
4045
4046 # For now, always pick the highest quality.
4047 rendition = renditions[-1]
4048
4049 try:
4050 _,_,ext = rendition.attrib['type'].partition('/')
4051 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4052 video_url = rendition.find('./src').text
4053 except KeyError:
4054 self._downloader.trouble('Invalid rendition field.')
4055 return
4056
4057 self._downloader.increment_downloads()
4058 info = {
4059 'id': video_id,
4060 'url': video_url,
4061 'uploader': performer,
4062 'title': video_title,
4063 'stitle': _simplify_title(video_title),
4064 'ext': ext,
4065 'format': format,
4066 }
4067
4068 try:
4069 self._downloader.process_info(info)
4070 except UnavailableVideoError, err:
4071 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4072
4073
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is attached to a downloader through its
    add_post_processor() method. After every successful download the
    downloader calls run() on each registered processor in turn, feeding
    each one the dictionary returned by its predecessor (the first call
    receives the downloader's initial info dictionary).

    The chain stops as soon as a processor returns None, or once the last
    processor has run.

    Like InfoExtractor objects, PostProcessors take part in a mutual
    registration scheme with their downloader.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach *downloader* to this post processor."""
        self._downloader = downloader

    def run(self, information):
        """Run this post-processing step.

        *information* is an InfoExtractor-style dictionary carrying one
        extra key, "filepath", naming the downloaded file.

        Returning None halts the postprocessing chain; returning a
        dictionary (possibly with modified fields) passes it on to the
        next processor. Implementations may also raise
        PostProcessingError, which the calling downloader handles.
        """
        return information # by default, do nothing
4119
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe audio conversion fails.

    Subclasses Exception rather than BaseException (the original base)
    so generic 'except Exception' handlers can see it; the only catch
    site in this file uses a bare except plus isinstance, so the change
    is backward-compatible.
    """
    def __init__(self, message):
        Exception.__init__(self, message)  # makes str(err) meaningful
        self.message = message
4123
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a downloaded video.

    Shells out to the external ffprobe/ffmpeg binaries. Depending on the
    source codec and the preferred codec, the audio stream is either
    copied losslessly into a new container or re-encoded (lossy).
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        # 'best' means: keep the source codec when possible, else mp3.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec      # target codec name, or 'best'
        self._preferredquality = preferredquality  # value for ffmpeg '-ab', or None
        self._keepvideo = keepvideo                # keep the source video file after extraction

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name of *path* via ffprobe, or None on any failure."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            # ffprobe missing or not executable.
            return None
        audio_codec = None
        # ffprobe emits key=value lines; remember the last codec_name seen
        # and report it when the matching codec_type=audio line appears
        # (assumes codec_name precedes codec_type within a stream block).
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract audio from *path* into *out_path*.

        *codec* of None lets ffmpeg pick the codec; otherwise it is passed
        via '-acodec'. Raises AudioConversionError when the ffmpeg binary
        is missing or exits non-zero.
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            # errno 2 == ENOENT: the ffmpeg executable itself was not found.
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                raise e
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the failure reason.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert the downloaded file's audio and return the updated info dict.

        Returns None (halting the postprocessing chain) when the source
        codec cannot be determined or conversion fails.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        # First branch: the preferred codec matches (or is compatible with)
        # the source codec, so a lossless stream copy may be possible.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except:
            # Bare except by design: distinguish our own conversion errors
            # from unexpected failures, then stop the chain either way.
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4248
4249
4250 def updateSelf(downloader, filename):
4251 ''' Update the program file with the latest version from the repository '''
4252 # Note: downloader only used for options
4253 if not os.access(filename, os.W_OK):
4254 sys.exit('ERROR: no write permissions on %s' % filename)
4255
4256 downloader.to_screen(u'Updating to latest version...')
4257
4258 try:
4259 try:
4260 urlh = urllib.urlopen(UPDATE_URL)
4261 newcontent = urlh.read()
4262
4263 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4264 if vmatch is not None and vmatch.group(1) == __version__:
4265 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4266 return
4267 finally:
4268 urlh.close()
4269 except (IOError, OSError), err:
4270 sys.exit('ERROR: unable to download latest version')
4271
4272 try:
4273 outf = open(filename, 'wb')
4274 try:
4275 outf.write(newcontent)
4276 finally:
4277 outf.close()
4278 except (IOError, OSError), err:
4279 sys.exit('ERROR: unable to overwrite current version')
4280
4281 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4282
4283 def parseOpts():
4284 def _readOptions(filename_bytes):
4285 try:
4286 optionf = open(filename_bytes)
4287 except IOError:
4288 return [] # silently skip if file is not present
4289 try:
4290 res = []
4291 for l in optionf:
4292 res += shlex.split(l, comments=True)
4293 finally:
4294 optionf.close()
4295 return res
4296
4297 def _format_option_string(option):
4298 ''' ('-o', '--option') -> -o, --format METAVAR'''
4299
4300 opts = []
4301
4302 if option._short_opts: opts.append(option._short_opts[0])
4303 if option._long_opts: opts.append(option._long_opts[0])
4304 if len(opts) > 1: opts.insert(1, ', ')
4305
4306 if option.takes_value(): opts.append(' %s' % option.metavar)
4307
4308 return "".join(opts)
4309
4310 def _find_term_columns():
4311 columns = os.environ.get('COLUMNS', None)
4312 if columns:
4313 return int(columns)
4314
4315 try:
4316 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4317 out,err = sp.communicate()
4318 return int(out.split()[1])
4319 except:
4320 pass
4321 return None
4322
4323 max_width = 80
4324 max_help_position = 80
4325
4326 # No need to wrap help messages if we're on a wide console
4327 columns = _find_term_columns()
4328 if columns: max_width = columns
4329
4330 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4331 fmt.format_option_strings = _format_option_string
4332
4333 kw = {
4334 'version' : __version__,
4335 'formatter' : fmt,
4336 'usage' : '%prog [options] url [url...]',
4337 'conflict_handler' : 'resolve',
4338 }
4339
4340 parser = optparse.OptionParser(**kw)
4341
4342 # option groups
4343 general = optparse.OptionGroup(parser, 'General Options')
4344 selection = optparse.OptionGroup(parser, 'Video Selection')
4345 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4346 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4347 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4348 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4349 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4350
4351 general.add_option('-h', '--help',
4352 action='help', help='print this help text and exit')
4353 general.add_option('-v', '--version',
4354 action='version', help='print program version and exit')
4355 general.add_option('-U', '--update',
4356 action='store_true', dest='update_self', help='update this program to latest version')
4357 general.add_option('-i', '--ignore-errors',
4358 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4359 general.add_option('-r', '--rate-limit',
4360 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4361 general.add_option('-R', '--retries',
4362 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4363 general.add_option('--dump-user-agent',
4364 action='store_true', dest='dump_user_agent',
4365 help='display the current browser identification', default=False)
4366 general.add_option('--list-extractors',
4367 action='store_true', dest='list_extractors',
4368 help='List all supported extractors and the URLs they would handle', default=False)
4369
4370 selection.add_option('--playlist-start',
4371 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4372 selection.add_option('--playlist-end',
4373 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4374 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4375 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4376 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4377
4378 authentication.add_option('-u', '--username',
4379 dest='username', metavar='USERNAME', help='account username')
4380 authentication.add_option('-p', '--password',
4381 dest='password', metavar='PASSWORD', help='account password')
4382 authentication.add_option('-n', '--netrc',
4383 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4384
4385
4386 video_format.add_option('-f', '--format',
4387 action='store', dest='format', metavar='FORMAT', help='video format code')
4388 video_format.add_option('--all-formats',
4389 action='store_const', dest='format', help='download all available video formats', const='all')
4390 video_format.add_option('--prefer-free-formats',
4391 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4392 video_format.add_option('--max-quality',
4393 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4394 video_format.add_option('-F', '--list-formats',
4395 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4396 video_format.add_option('--write-srt',
4397 action='store_true', dest='writesubtitles',
4398 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4399 video_format.add_option('--srt-lang',
4400 action='store', dest='subtitleslang', metavar='LANG',
4401 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4402
4403
4404 verbosity.add_option('-q', '--quiet',
4405 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4406 verbosity.add_option('-s', '--simulate',
4407 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4408 verbosity.add_option('--skip-download',
4409 action='store_true', dest='skip_download', help='do not download the video', default=False)
4410 verbosity.add_option('-g', '--get-url',
4411 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4412 verbosity.add_option('-e', '--get-title',
4413 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4414 verbosity.add_option('--get-thumbnail',
4415 action='store_true', dest='getthumbnail',
4416 help='simulate, quiet but print thumbnail URL', default=False)
4417 verbosity.add_option('--get-description',
4418 action='store_true', dest='getdescription',
4419 help='simulate, quiet but print video description', default=False)
4420 verbosity.add_option('--get-filename',
4421 action='store_true', dest='getfilename',
4422 help='simulate, quiet but print output filename', default=False)
4423 verbosity.add_option('--get-format',
4424 action='store_true', dest='getformat',
4425 help='simulate, quiet but print output format', default=False)
4426 verbosity.add_option('--no-progress',
4427 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4428 verbosity.add_option('--console-title',
4429 action='store_true', dest='consoletitle',
4430 help='display progress in console titlebar', default=False)
4431 verbosity.add_option('-v', '--verbose',
4432 action='store_true', dest='verbose', help='print various debugging information', default=False)
4433
4434
4435 filesystem.add_option('-t', '--title',
4436 action='store_true', dest='usetitle', help='use title in file name', default=False)
4437 filesystem.add_option('-l', '--literal',
4438 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4439 filesystem.add_option('-A', '--auto-number',
4440 action='store_true', dest='autonumber',
4441 help='number downloaded files starting from 00000', default=False)
4442 filesystem.add_option('-o', '--output',
4443 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4444 filesystem.add_option('-a', '--batch-file',
4445 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4446 filesystem.add_option('-w', '--no-overwrites',
4447 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4448 filesystem.add_option('-c', '--continue',
4449 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4450 filesystem.add_option('--no-continue',
4451 action='store_false', dest='continue_dl',
4452 help='do not resume partially downloaded files (restart from beginning)')
4453 filesystem.add_option('--cookies',
4454 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4455 filesystem.add_option('--no-part',
4456 action='store_true', dest='nopart', help='do not use .part files', default=False)
4457 filesystem.add_option('--no-mtime',
4458 action='store_false', dest='updatetime',
4459 help='do not use the Last-modified header to set the file modification time', default=True)
4460 filesystem.add_option('--write-description',
4461 action='store_true', dest='writedescription',
4462 help='write video description to a .description file', default=False)
4463 filesystem.add_option('--write-info-json',
4464 action='store_true', dest='writeinfojson',
4465 help='write video metadata to a .info.json file', default=False)
4466
4467
4468 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4469 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4470 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4471 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4472 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4473 help='ffmpeg audio bitrate specification, 128k by default')
4474 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4475 help='keeps the video file on disk after the post-processing; the video is erased by default')
4476
4477
4478 parser.add_option_group(general)
4479 parser.add_option_group(selection)
4480 parser.add_option_group(filesystem)
4481 parser.add_option_group(verbosity)
4482 parser.add_option_group(video_format)
4483 parser.add_option_group(authentication)
4484 parser.add_option_group(postproc)
4485
4486 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4487 if xdg_config_home:
4488 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4489 else:
4490 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4491 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4492 opts, args = parser.parse_args(argv)
4493
4494 return parser, opts, args
4495
def gen_extractors():
	"""Build and return the ordered list of information extractor instances.

	Order is significant: in _real_main each URL is handed to the first
	extractor whose suitable() accepts it, so the catch-all GenericIE must
	come last.
	"""
	# A few extractors are shared: the playlist/user/search variants wrap
	# the same YoutubeIE instance, and likewise for Google and Yahoo.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic fallback goes last so every site-specific extractor gets
	# first refusal.
	extractors.append(GenericIE())
	return extractors
4532
def _real_main():
	"""Parse the command line, build a configured FileDownloader and run it.

	Every path terminates the process via sys.exit() or parser.error():
	informational options (--dump-user-agent, --list-extractors) exit
	immediately, option validation errors abort, and otherwise the
	downloader's return code becomes the exit status.
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar: in-memory by default, or a Mozilla-format
	# file jar when --cookies was given.
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load if the file already exists and is readable; a
			# missing file is not an error (it is written on save below).
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent (--dump-user-agent) and quit.
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification: read URLs from --batch-file ('-' = stdin),
	# dropping blank lines and lines starting with '#', '/' or ';'.
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration: install a process-wide urllib2 opener that
	# handles cookies, environment proxies and the custom YoutubeDLHandler.
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	# --list-extractors: print each extractor name followed by the given
	# URLs it would handle; matched URLs are removed from all_urls so each
	# URL is listed under at most one extractor.
	if opts.list_extractors:
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively so the password need not appear on the
		# command line.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader. Note that any --get-* option implies quiet mode and
	# skipping the actual download (print the requested field only).
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First truthy template wins: an explicit -o template, then the
		# defaults derived from --format/--title/--literal/--auto-number.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means progress must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version (--update overwrites the script at sys.argv[0])
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing: no URLs is an error unless we only came to update.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4706
def main():
	"""Entry point: run _real_main() and translate known failures into exit statuses."""
	try:
		_real_main()
	except DownloadError:
		# NOTE(review): presumably the error was already printed where it
		# was raised, so we only signal failure here — confirm upstream.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
4716
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: