# Retrieved from the jfr.im git mirror of yt-dlp.git (path: youtube-dl)
# Commit subject: "Always extract original URL from next_url (#318)"
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 'Sören Schulze',
15 'Kevin Ngo',
16 'Ori Avtalion',
17 'shizeeg',
18 )
19
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52 import ctypes
53
54 try:
55 import email.utils
56 except ImportError: # Python 2.4
57 import email.Utils
58 try:
59 import cStringIO as StringIO
60 except ImportError:
61 import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65 from urlparse import parse_qs
66 except ImportError:
67 from cgi import parse_qs
68
69 try:
70 import lxml.etree
71 except ImportError:
72 pass # Handled below
73
74 try:
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
79 std_headers = {
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
85 }
86
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module: only loads() is provided."""
        @staticmethod
        def loads(s):
            # Hand-written recursive-descent JSON parser. Input is a UTF-8
            # byte string; output uses unicode strings for all JSON strings.
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # Error messages include the offending position and the tail of the input.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; with expectMore, running off the end is an error.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (including \uXXXX and
                # surrogate pairs) into the character it denotes.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves into one code point.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    # Find the closing quote, skipping quotes preceded by an
                    # odd number of backslashes (i.e. escaped quotes).
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Parse the literals true/false/null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fraction or exponent marker means a float; otherwise an int.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; anything else is tried as a number.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Fixes two defects of the original: a bare 'except:' (which also
    # swallowed KeyboardInterrupt/SystemExit) and a needless infinite
    # generator whose .next() was used to return a single value.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable; some broken
        # locales report encodings Python cannot encode with.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
215
216
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
219
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
222 """
223 entity = matchobj.group(1)
224
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
228
229 # Unicode character
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
231 if mobj is not None:
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
234 base = 16
235 numstr = u'0%s' % numstr
236 else:
237 base = 10
238 return unichr(long(numstr, base))
239
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
242
243
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then replace the OS path separator so the
    # title cannot introduce directory components into the output path.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows the stream must be switched to
            # binary mode so video data is not newline-mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a POSIX timestamp (or None).

    Returns None when the string cannot be parsed.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
284
def _simplify_title(title):
    """Reduce a title to word characters, digits, '_' and '-'.

    Every run of other characters becomes a single underscore; leading and
    trailing underscores are stripped.
    """
    expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
    return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
def _unescapeHTML(s):
    """
    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    # Delegate entity decoding to the stdlib HTML parser.
    return HTMLParser.HTMLParser().unescape(s)
305
306 def _encodeFilename(s):
307 """
308 @param s The name of the file (of type unicode)
309 """
310
311 assert type(s) == type(u'')
312
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317 return s
318 else:
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
329
330
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
338
339
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
347
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
351
352
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
360
361
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file turns out to
    be smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Byte counts; instance values are set by __init__.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send a raw deflate stream (no zlib header): try the
        # raw form first, then fall back to a standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer Pythons let addinfourl carry the status code directly;
        # older ones need the attribute set after construction.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, overriding any caller-supplied values.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header suppresses compression and is stripped before
        # the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
435
436
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do), it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    usenetrc:          Use netrc for authentication instead.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    ignoreerrors:      Do not stop on download errors.
    ratelimit:         Download speed limit, in bytes/sec.
    nooverwrites:      Prevent overwriting files.
    retries:           Number of times to retry for HTTP error 5xx
    continuedl:        Try to continue downloads if possible.
    noprogress:        Do not print the progress bar.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    nopart:            Do not use temporary .part files.
    updatetime:        Use the Last-modified header to set output file timestamps.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writesubtitles:    Write the video subtitles to a .srt file
    subtitleslang:     Language of the subtitles to download
    """

    params = None               # Options dictionary (see docstring above)
    _ies = []                   # Registered InfoExtractors
    _pps = []                   # Registered PostProcessors
    _download_retcode = None    # Exit code returned by download()
    _num_downloads = None       # Ordinal of the current download
    _screen_file = None         # Stream used by to_screen()
503
def __init__(self, params):
    """Create a FileDownloader object with the given options."""
    self._ies = []
    self._pps = []
    self._download_retcode = 0
    self._num_downloads = 0
    # Index with the boolean option: False -> stdout, True -> stderr.
    use_stderr = params.get('logtostderr', False)
    self._screen_file = [sys.stdout, sys.stderr][use_stderr]
    self.params = params
512
@staticmethod
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.21M'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the largest power of 1024 not exceeding the value (0 for zero).
    exponent = 0
    if bytes != 0.0:
        exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
526
@staticmethod
def calc_percent(byte_counter, data_len):
    """Format download progress as a fixed-width percentage string."""
    if data_len is None:
        return '---.-%'
    percent = float(byte_counter) / float(data_len) * 100.0
    return '%6s' % ('%3.1f%%' % percent)
532
@staticmethod
def calc_eta(start, now, total, current):
    """Estimate remaining download time, formatted as MM:SS."""
    if total is None:
        return '--:--'
    elapsed = now - start
    if current == 0 or elapsed < 0.001: # One millisecond
        return '--:--'
    # Remaining bytes divided by the average rate so far.
    rate = float(current) / elapsed
    eta = long((float(total) - float(current)) / rate)
    eta_mins, eta_secs = divmod(eta, 60)
    if eta_mins > 99:
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
546
@staticmethod
def calc_speed(start, now, bytes):
    """Format the average download speed over the elapsed interval."""
    elapsed = now - start
    if bytes == 0 or elapsed < 0.001: # One millisecond
        return '%10s' % '---b/s'
    speed = FileDownloader.format_bytes(float(bytes) / elapsed)
    return '%10s' % ('%s/s' % speed)
553
@staticmethod
def best_block_size(elapsed_time, bytes):
    """Choose the next read block size from the measured throughput.

    The candidate window is [bytes/2, bytes*2] clamped to at least one
    byte and at most 4 MB; the measured rate is then fitted into it.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
        return long(new_max)
    rate = bytes / elapsed_time
    # NOTE: the branch order matters — the upper bound is checked first.
    if rate > new_max:
        return long(new_max)
    if rate < new_min:
        return long(new_min)
    return long(rate)
566
@staticmethod
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer."""
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An absent suffix lowercases to '' which indexes position 0 (bytes).
    suffix = matchobj.group(2).lower()
    multiplier = 1024.0 ** 'bkmgtpezy'.index(suffix)
    return long(round(number * multiplier))
576
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    self._ies.append(ie)
    # Mutual registration: the IE needs a reference back to its downloader.
    ie.set_downloader(self)

def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    self._pps.append(pp)
    pp.set_downloader(self)
586
def to_screen(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode."""
    assert type(message) == type(u'')
    if not self.params.get('quiet', False):
        terminator = [u'\n', u''][skip_eol]
        output = message + terminator

        # Text-mode streams need the unicode message encoded first.
        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
            output = output.encode(preferredencoding(), 'ignore')
        self._screen_file.write(output)
        self._screen_file.flush()
598
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode explicitly: stderr may not accept unicode under Python 2.
    print >>sys.stderr, message.encode(preferredencoding())
602
def to_cons_title(self, message):
    """Set console/terminal window title to message."""
    if not self.params.get('consoletitle', False):
        return
    if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
    elif 'TERM' in os.environ:
        # xterm-style escape sequence sets the terminal window title.
        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
613
def fixed_template(self):
    """Checks if the output template is fixed."""
    # Fixed means it contains no '%(field)s' placeholders at all, so every
    # download would be written to the same file name.
    return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # In ignore-errors mode, remember the failure so the process can still
    # exit with a non-zero return code at the end.
    self._download_retcode = 1
630
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    elapsed = time.time() - start_time
    if elapsed <= 0.0:
        return
    if float(byte_counter) / elapsed > rate_limit:
        # Sleep just long enough that the average speed falls back to the limit.
        time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
643
def temp_name(self, filename):
    """Returns a temporary filename for the given filename."""
    # Write directly to the final name when .part files are disabled, when
    # streaming to stdout ('-'), or when the destination exists but is not
    # a regular file.
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
        return filename
    return filename + u'.part'
650
def undo_temp_name(self, filename):
    """Strip the trailing '.part' marker from a temporary filename."""
    suffix = u'.part'
    if filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename
655
def try_rename(self, old_filename, new_filename):
    """Rename the temporary file to its final name, reporting failures."""
    try:
        if old_filename == new_filename:
            return
        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
    except (IOError, OSError), err:
        self.trouble(u'ERROR: unable to rename file')
663
def try_utime(self, filename, last_modified_hdr):
    """Try to set the last-modified time of the given file.

    Returns the timestamp that was applied, or None when the header is
    missing, unparsable, or the file does not exist.
    """
    # Fixes two defects of the original: a duplicated None-check through
    # the redundant 'timestr' alias, and a bare 'except:' that also
    # swallowed KeyboardInterrupt/SystemExit.
    if last_modified_hdr is None:
        return
    if not os.path.isfile(_encodeFilename(filename)):
        return
    filetime = timeconvert(last_modified_hdr)
    if filetime is None:
        return filetime
    try:
        os.utime(filename, (time.time(), filetime))
    except (IOError, OSError):
        # Best effort only: the timestamp is cosmetic, so ignore OS errors.
        pass
    return filetime
681
def report_writedescription(self, descfn):
    """ Report that the description file is being written """
    self.to_screen(u'[info] Writing video description to: ' + descfn)

def report_writesubtitles(self, srtfn):
    """ Report that the subtitles file is being written """
    self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

def report_writeinfojson(self, infofn):
    """ Report that the metadata file has been written """
    self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

def report_destination(self, filename):
    """Report destination filename."""
    self.to_screen(u'[download] Destination: ' + filename)

def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    if self.params.get('noprogress', False):
        return
    # The leading '\r' rewrites the current line so the progress bar
    # updates in place instead of scrolling.
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
    self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
            (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except (UnicodeEncodeError), err:
        # The file name may not be representable in the console charset.
        self.to_screen(u'[download] The file has already been downloaded')

def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    self.to_screen(u'[download] Unable to resume')

def report_finish(self):
    """Report download finished."""
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')
    else:
        # The progress line already shows 100%; just terminate it.
        self.to_screen(u'')

def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads += 1
736
def prepare_filename(self, info_dict):
    """Generate the output filename.

    Returns None (after reporting trouble) if the template references a
    missing field or cannot be interpolated.
    """
    try:
        template_dict = dict(info_dict)
        # Extra template fields computed at download time.
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        return filename
    except (ValueError, KeyError), err:
        self.trouble(u'ERROR: invalid system charset or erroneous output template')
        return None
748
def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded """
    # Otherwise returns a unicode message explaining why it was skipped.

    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    return None
760
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
763
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
767 return
768
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
773
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
792 return
793
794 if filename is None:
795 return
796
797 try:
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
800 os.makedirs(dn)
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
803 return
804
805 if self.params.get('writedescription', False):
806 try:
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
810 try:
811 descfile.write(info_dict['description'].encode('utf-8'))
812 finally:
813 descfile.close()
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
816 return
817
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
821 try:
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
825 try:
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
827 finally:
828 srtfile.close()
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
831 return
832
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
836 try:
837 json.dump
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
840 return
841 try:
842 infof = open(_encodeFilename(infofn), 'wb')
843 try:
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
846 finally:
847 infof.close()
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
850 return
851
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
854 success = True
855 else:
856 try:
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 return
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
865 return
866
867 if success:
868 try:
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
872 return
873
def download(self, url_list):
    """Download a given list of URLs."""
    # With a fixed (placeholder-free) output template, more than one URL
    # would overwrite the same file, so refuse early.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        suitable_found = False
        for ie in self._ies:
            # Go to next InfoExtractor if not suitable
            if not ie.suitable(url):
                continue

            # Suitable InfoExtractor found
            suitable_found = True

            # Extract information from URL and process it
            ie.extract(url)

            # Suitable InfoExtractor had been found; go to next URL
            break

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode
899
def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # Work on a copy so the caller's info dict is not mutated.
    info = dict(ie_info)
    info['filepath'] = filename
    for pp in self._pps:
        info = pp.run(info)
        # A postprocessor returning None stops the chain.
        if info is None:
            break
908
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an rtmp:// URL by shelling out to the rtmpdump binary.

    Returns True on success, False after reporting trouble otherwise.
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    try:
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
    if self.params.get('verbose', False):
        try:
            import pipes
            shell_quote = lambda args: ' '.join(map(pipes.quote, args))
        except ImportError:
            shell_quote = repr
        self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
    retval = subprocess.call(args)
    # Keep re-invoking rtmpdump with '-e' (resume) while it reports a
    # resumable interruption, bailing out if no progress is being made.
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(_encodeFilename(tmpfilename))
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(_encodeFilename(tmpfilename))
        if prevsize == cursize and retval == 1:
            break
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            retval = 0
            break
    if retval == 0:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
        self.try_rename(tmpfilename, filename)
        return True
    else:
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        return False
953
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
957
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
961 return True
962
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
966
967 tmpfilename = self.temp_name(filename)
968 stream = None
969
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
974
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
978 else:
979 resume_len = 0
980
981 open_mode = 'wb'
982 if resume_len != 0:
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
986 open_mode = 'ab'
987 else:
988 resume_len = 0
989
990 count = 0
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
994 try:
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
998 break
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1002 raise
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1005 try:
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1011 raise
1012 else:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1025 return True
1026 else:
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1029 open_mode = 'wb'
1030 break
1031 # Retry
1032 count += 1
1033 if count <= retries:
1034 self.report_retry(count, retries)
1035
1036 if count > retries:
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1038 return False
1039
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1045 block_size = 1024
1046 start = time.time()
1047 while True:
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1051 after = time.time()
1052 if len(data_block) == 0:
1053 break
1054 byte_counter += len(data_block)
1055
1056 # Open file just in time
1057 if stream is None:
1058 try:
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065 return False
1066 try:
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 return False
1071 block_size = self.best_block_size(after - before, len(data_block))
1072
1073 # Progress message
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 else:
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082 # Apply rate limit
1083 self.slow_down(start, byte_counter - resume_len)
1084
1085 if stream is None:
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1087 return False
1088 stream.close()
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1093
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098 return True
1099
1100
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor turns a URL into one or more dictionaries
	describing the video(s) behind it.  Those dictionaries are handed to
	the FileDownloader, which acts on them — typically by downloading the
	media to the file system.  Each dictionary must provide:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	Optional fields, used only by the forced-printing helpers (for
	instance when youtube-dl backs a video search frontend such as
	youtube2mp3):

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should normally also
	be registered in the list of extractors.
	"""

	# Flipped to True by initialize() the first time it runs.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time setup (authentication, etc) if not done yet."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1169
1170
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 captures the URL prefix (scheme/host/path up to the ID);
	# group 2 is the video ID.  The conditional '(?(1).+)?' permits
	# trailing text only when a prefix actually matched.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Pulls the original target out of redirect-style URLs (e.g. age gate).
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> container extension; anything missing falls back to 'flv'.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# itag -> dimensions string, shown by _print_formats (--list-formats).
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (SRT) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'  # default duration when the XML omits it
			start = float(start)
			end = start + float(dur)
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
			# NOTE(review): SRT cue numbers conventionally start at 1; this
			# emits 0-based indices -- confirm players tolerate it.
			srt += str(n) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print the available format codes with extension and dimensions."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set the site language and, if credentials are available, log in
		and confirm age.  All failures here are reported as warnings (or an
		error for age confirmation) and abort initialization early."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
			'current_form': 'loginForm',
			'next': '/',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form being present in the response means we were
			# bounced back to it, i.e. the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
			'next_url': '/',
			'action_confirm': 'Confirm',
		}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the video information and hand each selected format to
		the downloader via process_info()."""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (the \\/ escapes are undone below)
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' variants until one response
		# contains a 'token' field
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: normalize separators, then try several date formats
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description (lxml is optional; fall back to a meta-tag regex when
		# it was not importable at module load time)
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions: pick the requested language, else English,
		# else the first one listed
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
			try:
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			else:
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if srt_lang_list:
					if self._downloader.params.get('subtitleslang', False):
						srt_lang = self._downloader.params.get('subtitleslang')
					elif 'en' in srt_lang_list:
						srt_lang = 'en'
					else:
						srt_lang = srt_lang_list[0]
					if not srt_lang in srt_lang_list:
						self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
					else:
						request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
						try:
							srt_xml = urllib2.urlopen(request).read()
						except (urllib2.URLError, httplib.HTTPException, socket.error), err:
							self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
						else:
							video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				else:
					self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Each comma-separated entry is itself a urlencoded dict with
			# at least 'itag' (format code) and 'url'
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description,
					'player_url': player_url,
					'subtitles': video_subtitles
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1547
1548
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Metacafe re-hosts some YouTube videos ('yt-' prefixed IDs); those are
	delegated to the YoutubeIE instance passed to the constructor.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. youtube_ie handles 'yt-' prefixed video IDs."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and disable the family filter so
		age-restricted videos are served."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
		}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate to the
		# YouTube extractor
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage.  Two page layouts
		# are handled: a direct &mediaURL= parameter, or a "flashvars"
		# blob carrying a JSON-ish mediaData entry.
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1689
1690
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information.
		# The cookie disables the family filter so age-restricted
		# videos are served.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage.  The media URL
		# lives in the urlencoded "sequence" flash variable; sdURL is
		# presumably the standard-definition stream — TODO confirm.
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': u'NA',
				'player_url': None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1777
1778
1779 class GoogleIE(InfoExtractor):
1780 """Information extractor for video.google.com."""
1781
1782 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1783 IE_NAME = u'video.google'
1784
1785 def __init__(self, downloader=None):
1786 InfoExtractor.__init__(self, downloader)
1787
1788 def report_download_webpage(self, video_id):
1789 """Report webpage download."""
1790 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1791
1792 def report_extraction(self, video_id):
1793 """Report information extraction."""
1794 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1795
1796 def _real_extract(self, url):
1797 # Extract id from URL
1798 mobj = re.match(self._VALID_URL, url)
1799 if mobj is None:
1800 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1801 return
1802
1803 # At this point we have a new video
1804 self._downloader.increment_downloads()
1805 video_id = mobj.group(1)
1806
1807 video_extension = 'mp4'
1808
1809 # Retrieve video webpage to extract further information
1810 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1811 try:
1812 self.report_download_webpage(video_id)
1813 webpage = urllib2.urlopen(request).read()
1814 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1815 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1816 return
1817
1818 # Extract URL, uploader, and title from webpage
1819 self.report_extraction(video_id)
1820 mobj = re.search(r"download_url:'([^']+)'", webpage)
1821 if mobj is None:
1822 video_extension = 'flv'
1823 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1824 if mobj is None:
1825 self._downloader.trouble(u'ERROR: unable to extract media URL')
1826 return
1827 mediaURL = urllib.unquote(mobj.group(1))
1828 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1829 mediaURL = mediaURL.replace('\\x26', '\x26')
1830
1831 video_url = mediaURL
1832
1833 mobj = re.search(r'<title>(.*)</title>', webpage)
1834 if mobj is None:
1835 self._downloader.trouble(u'ERROR: unable to extract title')
1836 return
1837 video_title = mobj.group(1).decode('utf-8')
1838 video_title = sanitize_title(video_title)
1839 simple_title = _simplify_title(video_title)
1840
1841 # Extract video description
1842 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1843 if mobj is None:
1844 self._downloader.trouble(u'ERROR: unable to extract video description')
1845 return
1846 video_description = mobj.group(1).decode('utf-8')
1847 if not video_description:
1848 video_description = 'No description available.'
1849
1850 # Extract video thumbnail
1851 if self._downloader.params.get('forcethumbnail', False):
1852 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1853 try:
1854 webpage = urllib2.urlopen(request).read()
1855 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1857 return
1858 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1859 if mobj is None:
1860 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1861 return
1862 video_thumbnail = mobj.group(1)
1863 else: # we need something to pass to process_info
1864 video_thumbnail = ''
1865
1866 try:
1867 # Process video information
1868 self._downloader.process_info({
1869 'id': video_id.decode('utf-8'),
1870 'url': video_url.decode('utf-8'),
1871 'uploader': u'NA',
1872 'upload_date': u'NA',
1873 'title': video_title,
1874 'stitle': simple_title,
1875 'ext': video_extension.decode('utf-8'),
1876 'format': u'NA',
1877 'player_url': None,
1878 })
1879 except UnavailableVideoError:
1880 self._downloader.trouble(u'\nERROR: unable to download video')
1881
1882
1883 class PhotobucketIE(InfoExtractor):
1884 """Information extractor for photobucket.com."""
1885
1886 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1887 IE_NAME = u'photobucket'
1888
1889 def __init__(self, downloader=None):
1890 InfoExtractor.__init__(self, downloader)
1891
1892 def report_download_webpage(self, video_id):
1893 """Report webpage download."""
1894 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1895
1896 def report_extraction(self, video_id):
1897 """Report information extraction."""
1898 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1899
1900 def _real_extract(self, url):
1901 # Extract id from URL
1902 mobj = re.match(self._VALID_URL, url)
1903 if mobj is None:
1904 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1905 return
1906
1907 # At this point we have a new video
1908 self._downloader.increment_downloads()
1909 video_id = mobj.group(1)
1910
1911 video_extension = 'flv'
1912
1913 # Retrieve video webpage to extract further information
1914 request = urllib2.Request(url)
1915 try:
1916 self.report_download_webpage(video_id)
1917 webpage = urllib2.urlopen(request).read()
1918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1919 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1920 return
1921
1922 # Extract URL, uploader, and title from webpage
1923 self.report_extraction(video_id)
1924 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1925 if mobj is None:
1926 self._downloader.trouble(u'ERROR: unable to extract media URL')
1927 return
1928 mediaURL = urllib.unquote(mobj.group(1))
1929
1930 video_url = mediaURL
1931
1932 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1933 if mobj is None:
1934 self._downloader.trouble(u'ERROR: unable to extract title')
1935 return
1936 video_title = mobj.group(1).decode('utf-8')
1937 video_title = sanitize_title(video_title)
1938 simple_title = _simplify_title(vide_title)
1939
1940 video_uploader = mobj.group(2).decode('utf-8')
1941
1942 try:
1943 # Process video information
1944 self._downloader.process_info({
1945 'id': video_id.decode('utf-8'),
1946 'url': video_url.decode('utf-8'),
1947 'uploader': video_uploader,
1948 'upload_date': u'NA',
1949 'title': video_title,
1950 'stitle': simple_title,
1951 'ext': video_extension.decode('utf-8'),
1952 'format': u'NA',
1953 'player_url': None,
1954 })
1955 except UnavailableVideoError:
1956 self._downloader.trouble(u'\nERROR: unable to download video')
1957
1958
1959 class YahooIE(InfoExtractor):
1960 """Information extractor for video.yahoo.com."""
1961
1962 # _VALID_URL matches all Yahoo! Video URLs
1963 # _VPAGE_URL matches only the extractable '/watch/' URLs
1964 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1965 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1966 IE_NAME = u'video.yahoo'
1967
1968 def __init__(self, downloader=None):
1969 InfoExtractor.__init__(self, downloader)
1970
1971 def report_download_webpage(self, video_id):
1972 """Report webpage download."""
1973 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1974
1975 def report_extraction(self, video_id):
1976 """Report information extraction."""
1977 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1978
1979 def _real_extract(self, url, new_video=True):
1980 # Extract ID from URL
1981 mobj = re.match(self._VALID_URL, url)
1982 if mobj is None:
1983 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1984 return
1985
1986 # At this point we have a new video
1987 self._downloader.increment_downloads()
1988 video_id = mobj.group(2)
1989 video_extension = 'flv'
1990
1991 # Rewrite valid but non-extractable URLs as
1992 # extractable English language /watch/ URLs
1993 if re.match(self._VPAGE_URL, url) is None:
1994 request = urllib2.Request(url)
1995 try:
1996 webpage = urllib2.urlopen(request).read()
1997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1998 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1999 return
2000
2001 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2002 if mobj is None:
2003 self._downloader.trouble(u'ERROR: Unable to extract id field')
2004 return
2005 yahoo_id = mobj.group(1)
2006
2007 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2008 if mobj is None:
2009 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2010 return
2011 yahoo_vid = mobj.group(1)
2012
2013 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2014 return self._real_extract(url, new_video=False)
2015
2016 # Retrieve video webpage to extract further information
2017 request = urllib2.Request(url)
2018 try:
2019 self.report_download_webpage(video_id)
2020 webpage = urllib2.urlopen(request).read()
2021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2022 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2023 return
2024
2025 # Extract uploader and title from webpage
2026 self.report_extraction(video_id)
2027 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2028 if mobj is None:
2029 self._downloader.trouble(u'ERROR: unable to extract video title')
2030 return
2031 video_title = mobj.group(1).decode('utf-8')
2032 simple_title = _simplify_title(video_title)
2033
2034 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2035 if mobj is None:
2036 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2037 return
2038 video_uploader = mobj.group(1).decode('utf-8')
2039
2040 # Extract video thumbnail
2041 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2042 if mobj is None:
2043 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2044 return
2045 video_thumbnail = mobj.group(1).decode('utf-8')
2046
2047 # Extract video description
2048 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2049 if mobj is None:
2050 self._downloader.trouble(u'ERROR: unable to extract video description')
2051 return
2052 video_description = mobj.group(1).decode('utf-8')
2053 if not video_description:
2054 video_description = 'No description available.'
2055
2056 # Extract video height and width
2057 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2058 if mobj is None:
2059 self._downloader.trouble(u'ERROR: unable to extract video height')
2060 return
2061 yv_video_height = mobj.group(1)
2062
2063 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2064 if mobj is None:
2065 self._downloader.trouble(u'ERROR: unable to extract video width')
2066 return
2067 yv_video_width = mobj.group(1)
2068
2069 # Retrieve video playlist to extract media URL
2070 # I'm not completely sure what all these options are, but we
2071 # seem to need most of them, otherwise the server sends a 401.
2072 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2073 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2074 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2075 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2076 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2077 try:
2078 self.report_download_webpage(video_id)
2079 webpage = urllib2.urlopen(request).read()
2080 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2081 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2082 return
2083
2084 # Extract media URL from playlist XML
2085 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2086 if mobj is None:
2087 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2088 return
2089 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2090 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2091
2092 try:
2093 # Process video information
2094 self._downloader.process_info({
2095 'id': video_id.decode('utf-8'),
2096 'url': video_url,
2097 'uploader': video_uploader,
2098 'upload_date': u'NA',
2099 'title': video_title,
2100 'stitle': simple_title,
2101 'ext': video_extension.decode('utf-8'),
2102 'thumbnail': video_thumbnail.decode('utf-8'),
2103 'description': video_description,
2104 'thumbnail': video_thumbnail,
2105 'player_url': None,
2106 })
2107 except UnavailableVideoError:
2108 self._downloader.trouble(u'\nERROR: unable to download video')
2109
2110
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs (group 1 is the numeric video id)
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract and download a Vimeo video.

        Parses the page's embedded player config JSON for the metadata and
        the request signature/timestamp needed to build the final
        play_redirect URL.  new_video is accepted for signature parity with
        the other IEs but is not used here.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing the raw HTML between the
        # ' = {config:' and ',assets:' markers (no HTML parser involved),
        # then parsing the slice as JSON.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        try:
            config = json.loads(config)
        except:  # noqa: any parse failure (malformed slice or JSON) is fatal here
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]
        simple_title = _simplify_title(video_title)

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description.  The top-of-file 'import lxml.etree'
        # is wrapped in try/except, so merely naming lxml.etree here raises
        # NameError when the import failed -- that is what selects the
        # regex fallback branch.
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
            if mobj is not None:
                video_description = mobj.group(1)
        else:
            html_parser = lxml.etree.HTMLParser()
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
            # Concatenate all text nodes under the element with id="description"
            video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
            # TODO use another parser

        # Extract upload date (optional; stays u'NA' when not found)
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information.
        # Codecs are tried in order of preference; the for/else 'else'
        # fires only when no listed codec is available.
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        # Final media URL: Vimeo's play_redirect endpoint, authenticated by
        # the signature/timestamp pair extracted above.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, quality, video_codec.upper())

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': video_upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
2229
2230
2231 class GenericIE(InfoExtractor):
2232 """Generic last-resort information extractor."""
2233
2234 _VALID_URL = r'.*'
2235 IE_NAME = u'generic'
2236
2237 def __init__(self, downloader=None):
2238 InfoExtractor.__init__(self, downloader)
2239
2240 def report_download_webpage(self, video_id):
2241 """Report webpage download."""
2242 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2243 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2244
2245 def report_extraction(self, video_id):
2246 """Report information extraction."""
2247 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2248
2249 def _real_extract(self, url):
2250 # At this point we have a new video
2251 self._downloader.increment_downloads()
2252
2253 video_id = url.split('/')[-1]
2254 request = urllib2.Request(url)
2255 try:
2256 self.report_download_webpage(video_id)
2257 webpage = urllib2.urlopen(request).read()
2258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2259 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2260 return
2261 except ValueError, err:
2262 # since this is the last-resort InfoExtractor, if
2263 # this error is thrown, it'll be thrown here
2264 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2265 return
2266
2267 self.report_extraction(video_id)
2268 # Start with something easy: JW Player in SWFObject
2269 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2270 if mobj is None:
2271 # Broaden the search a little bit
2272 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2273 if mobj is None:
2274 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2275 return
2276
2277 # It's possible that one of the regexes
2278 # matched, but returned an empty group:
2279 if mobj.group(1) is None:
2280 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2281 return
2282
2283 video_url = urllib.unquote(mobj.group(1))
2284 video_id = os.path.basename(video_url)
2285
2286 # here's a fun little line of code for you:
2287 video_extension = os.path.splitext(video_id)[1][1:]
2288 video_id = os.path.splitext(video_id)[0]
2289
2290 # it's tempting to parse this further, but you would
2291 # have to take into account all the variations like
2292 # Video Title - Site Name
2293 # Site Name | Video Title
2294 # Video Title - Tagline | Site Name
2295 # and so on and so forth; it's just not practical
2296 mobj = re.search(r'<title>(.*)</title>', webpage)
2297 if mobj is None:
2298 self._downloader.trouble(u'ERROR: unable to extract title')
2299 return
2300 video_title = mobj.group(1).decode('utf-8')
2301 video_title = sanitize_title(video_title)
2302 simple_title = _simplify_title(video_title)
2303
2304 # video uploader is domain name
2305 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2306 if mobj is None:
2307 self._downloader.trouble(u'ERROR: unable to extract title')
2308 return
2309 video_uploader = mobj.group(1).decode('utf-8')
2310
2311 try:
2312 # Process video information
2313 self._downloader.process_info({
2314 'id': video_id.decode('utf-8'),
2315 'url': video_url.decode('utf-8'),
2316 'uploader': video_uploader,
2317 'upload_date': u'NA',
2318 'title': video_title,
2319 'stitle': simple_title,
2320 'ext': video_extension.decode('utf-8'),
2321 'format': u'NA',
2322 'player_url': None,
2323 })
2324 except UnavailableVideoError, err:
2325 self._downloader.trouble(u'\nERROR: unable to download video')
2326
2327
2328 class YoutubeSearchIE(InfoExtractor):
2329 """Information Extractor for YouTube search queries."""
2330 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2331 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2332 _youtube_ie = None
2333 _max_youtube_results = 1000
2334 IE_NAME = u'youtube:search'
2335
2336 def __init__(self, youtube_ie, downloader=None):
2337 InfoExtractor.__init__(self, downloader)
2338 self._youtube_ie = youtube_ie
2339
2340 def report_download_page(self, query, pagenum):
2341 """Report attempt to download playlist page with given number."""
2342 query = query.decode(preferredencoding())
2343 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2344
2345 def _real_initialize(self):
2346 self._youtube_ie.initialize()
2347
2348 def _real_extract(self, query):
2349 mobj = re.match(self._VALID_URL, query)
2350 if mobj is None:
2351 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2352 return
2353
2354 prefix, query = query.split(':')
2355 prefix = prefix[8:]
2356 query = query.encode('utf-8')
2357 if prefix == '':
2358 self._download_n_results(query, 1)
2359 return
2360 elif prefix == 'all':
2361 self._download_n_results(query, self._max_youtube_results)
2362 return
2363 else:
2364 try:
2365 n = long(prefix)
2366 if n <= 0:
2367 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2368 return
2369 elif n > self._max_youtube_results:
2370 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2371 n = self._max_youtube_results
2372 self._download_n_results(query, n)
2373 return
2374 except ValueError: # parsing prefix as integer fails
2375 self._download_n_results(query, 1)
2376 return
2377
2378 def _download_n_results(self, query, n):
2379 """Downloads a specified number of results for a query"""
2380
2381 video_ids = []
2382 pagenum = 0
2383 limit = n
2384
2385 while (50 * pagenum) < limit:
2386 self.report_download_page(query, pagenum+1)
2387 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2388 request = urllib2.Request(result_url)
2389 try:
2390 data = urllib2.urlopen(request).read()
2391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2393 return
2394 api_response = json.loads(data)['data']
2395
2396 new_ids = list(video['id'] for video in api_response['items'])
2397 video_ids += new_ids
2398
2399 limit = min(n, api_response['totalItems'])
2400 pagenum += 1
2401
2402 if len(video_ids) > n:
2403 video_ids = video_ids[:n]
2404 for id in video_ids:
2405 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2406 return
2407
2408
2409 class GoogleSearchIE(InfoExtractor):
2410 """Information Extractor for Google Video search queries."""
2411 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2412 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2413 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2414 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2415 _google_ie = None
2416 _max_google_results = 1000
2417 IE_NAME = u'video.google:search'
2418
2419 def __init__(self, google_ie, downloader=None):
2420 InfoExtractor.__init__(self, downloader)
2421 self._google_ie = google_ie
2422
2423 def report_download_page(self, query, pagenum):
2424 """Report attempt to download playlist page with given number."""
2425 query = query.decode(preferredencoding())
2426 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2427
2428 def _real_initialize(self):
2429 self._google_ie.initialize()
2430
2431 def _real_extract(self, query):
2432 mobj = re.match(self._VALID_URL, query)
2433 if mobj is None:
2434 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2435 return
2436
2437 prefix, query = query.split(':')
2438 prefix = prefix[8:]
2439 query = query.encode('utf-8')
2440 if prefix == '':
2441 self._download_n_results(query, 1)
2442 return
2443 elif prefix == 'all':
2444 self._download_n_results(query, self._max_google_results)
2445 return
2446 else:
2447 try:
2448 n = long(prefix)
2449 if n <= 0:
2450 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2451 return
2452 elif n > self._max_google_results:
2453 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2454 n = self._max_google_results
2455 self._download_n_results(query, n)
2456 return
2457 except ValueError: # parsing prefix as integer fails
2458 self._download_n_results(query, 1)
2459 return
2460
2461 def _download_n_results(self, query, n):
2462 """Downloads a specified number of results for a query"""
2463
2464 video_ids = []
2465 pagenum = 0
2466
2467 while True:
2468 self.report_download_page(query, pagenum)
2469 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2470 request = urllib2.Request(result_url)
2471 try:
2472 page = urllib2.urlopen(request).read()
2473 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2475 return
2476
2477 # Extract video identifiers
2478 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2479 video_id = mobj.group(1)
2480 if video_id not in video_ids:
2481 video_ids.append(video_id)
2482 if len(video_ids) == n:
2483 # Specified n videos reached
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2486 return
2487
2488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2489 for id in video_ids:
2490 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2491 return
2492
2493 pagenum = pagenum + 1
2494
2495
2496 class YahooSearchIE(InfoExtractor):
2497 """Information Extractor for Yahoo! Video search queries."""
2498 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2499 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2500 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2501 _MORE_PAGES_INDICATOR = r'\s*Next'
2502 _yahoo_ie = None
2503 _max_yahoo_results = 1000
2504 IE_NAME = u'video.yahoo:search'
2505
2506 def __init__(self, yahoo_ie, downloader=None):
2507 InfoExtractor.__init__(self, downloader)
2508 self._yahoo_ie = yahoo_ie
2509
2510 def report_download_page(self, query, pagenum):
2511 """Report attempt to download playlist page with given number."""
2512 query = query.decode(preferredencoding())
2513 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2514
2515 def _real_initialize(self):
2516 self._yahoo_ie.initialize()
2517
2518 def _real_extract(self, query):
2519 mobj = re.match(self._VALID_URL, query)
2520 if mobj is None:
2521 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2522 return
2523
2524 prefix, query = query.split(':')
2525 prefix = prefix[8:]
2526 query = query.encode('utf-8')
2527 if prefix == '':
2528 self._download_n_results(query, 1)
2529 return
2530 elif prefix == 'all':
2531 self._download_n_results(query, self._max_yahoo_results)
2532 return
2533 else:
2534 try:
2535 n = long(prefix)
2536 if n <= 0:
2537 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2538 return
2539 elif n > self._max_yahoo_results:
2540 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2541 n = self._max_yahoo_results
2542 self._download_n_results(query, n)
2543 return
2544 except ValueError: # parsing prefix as integer fails
2545 self._download_n_results(query, 1)
2546 return
2547
2548 def _download_n_results(self, query, n):
2549 """Downloads a specified number of results for a query"""
2550
2551 video_ids = []
2552 already_seen = set()
2553 pagenum = 1
2554
2555 while True:
2556 self.report_download_page(query, pagenum)
2557 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2558 request = urllib2.Request(result_url)
2559 try:
2560 page = urllib2.urlopen(request).read()
2561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2563 return
2564
2565 # Extract video identifiers
2566 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2567 video_id = mobj.group(1)
2568 if video_id not in already_seen:
2569 video_ids.append(video_id)
2570 already_seen.add(video_id)
2571 if len(video_ids) == n:
2572 # Specified n videos reached
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2575 return
2576
2577 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2578 for id in video_ids:
2579 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2580 return
2581
2582 pagenum = pagenum + 1
2583
2584
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks every page of a playlist, collects the video ids found on each
    page, then delegates the download of every video to the YoutubeIE
    instance supplied at construction time.
    """

    # group(1): query key ('p', 'a' or 'list'), group(2): playlist id,
    # group(3): an optional single-video id embedded in the URL.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled in as (access point, query key, playlist id, page number).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # '%s' receives the playlist id; group(1) is a video id on the page.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
    # A 'Next' link in the page means more result pages follow.
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None  # delegate extractor used for the individual videos
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        # The delegate extractor may need to log in / set cookies first.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # 'list' and others are served through the plain playlist view
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers, preserving page order and
            # dropping duplicates within the page.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the page no longer advertises a next page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Honour --playlist-start/--playlist-end (user-facing values are
        # 1-based; internal slicing is 0-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return
2661
2662
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed of a user and delegates every
    collected video id to the YoutubeIE instance given at construction.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps results per request, so the feed is fetched in
    # pages of this size.
    _GDATA_PAGE_SIZE = 50
    # Filled in as (username, page size, 1-based start index).
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    # group(1) is a video id found in the feed markup.
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    _youtube_ie = None  # delegate extractor used for the individual videos
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers, keeping feed order and dropping
            # duplicates within the page.
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honour --playlist-start/--playlist-end (user-facing values are
        # 1-based; internal slicing is 0-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2749
2750
2751 class DepositFilesIE(InfoExtractor):
2752 """Information extractor for depositfiles.com"""
2753
2754 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2755 IE_NAME = u'DepositFiles'
2756
2757 def __init__(self, downloader=None):
2758 InfoExtractor.__init__(self, downloader)
2759
2760 def report_download_webpage(self, file_id):
2761 """Report webpage download."""
2762 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2763
2764 def report_extraction(self, file_id):
2765 """Report information extraction."""
2766 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2767
2768 def _real_extract(self, url):
2769 # At this point we have a new file
2770 self._downloader.increment_downloads()
2771
2772 file_id = url.split('/')[-1]
2773 # Rebuild url in english locale
2774 url = 'http://depositfiles.com/en/files/' + file_id
2775
2776 # Retrieve file webpage with 'Free download' button pressed
2777 free_download_indication = { 'gateway_result' : '1' }
2778 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2779 try:
2780 self.report_download_webpage(file_id)
2781 webpage = urllib2.urlopen(request).read()
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2784 return
2785
2786 # Search for the real file URL
2787 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2788 if (mobj is None) or (mobj.group(1) is None):
2789 # Try to figure out reason of the error.
2790 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2791 if (mobj is not None) and (mobj.group(1) is not None):
2792 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2793 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2794 else:
2795 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2796 return
2797
2798 file_url = mobj.group(1)
2799 file_extension = os.path.splitext(file_url)[1][1:]
2800
2801 # Search for file title
2802 mobj = re.search(r'<b title="(.*?)">', webpage)
2803 if mobj is None:
2804 self._downloader.trouble(u'ERROR: unable to extract title')
2805 return
2806 file_title = mobj.group(1).decode('utf-8')
2807
2808 try:
2809 # Process file information
2810 self._downloader.process_info({
2811 'id': file_id.decode('utf-8'),
2812 'url': file_url.decode('utf-8'),
2813 'uploader': u'NA',
2814 'upload_date': u'NA',
2815 'title': file_title,
2816 'stitle': file_title,
2817 'ext': file_extension.decode('utf-8'),
2818 'format': u'NA',
2819 'player_url': None,
2820 })
2821 except UnavailableVideoError, err:
2822 self._downloader.trouble(u'ERROR: unable to download file')
2823
2824
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in with credentials from the command line or .netrc,
    downloads the video page, and scrapes metadata and per-format media
    URLs out of inline JavaScript.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as they appear in the page source, best first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each regex captures one metadata field from the
        # inline JavaScript; fields that do not match are simply absent
        # from the returned dict.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped and percent-encoded in the page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Failures are reported as warnings and extraction continues
        anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # No credentials at all; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        # NOTE(review): _parse_page never sets 'upload_date', so this
        # branch currently never fires and the date stays u'NA'.
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Restrict to formats at or below the requested quality.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): video_url_list is only assigned inside the block
        # above; if url_map is empty this raises NameError — confirm
        # whether an empty url_map can actually reach this point.
        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
3040
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Appends skin=json to the page URL to obtain machine-readable
    metadata; if the server instead answers with the media file itself,
    the open response handle is passed through for direct downloading.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The server handed us the media itself: derive id/title
                # from the URL basename and reuse the open handle.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is still the open response from the try block above.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses either wrap the record in 'Post' or are the
                # record itself.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                # Any missing key or bad date/extension ends up here.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3133
3134
3135 class MyVideoIE(InfoExtractor):
3136 """Information Extractor for myvideo.de."""
3137
3138 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3139 IE_NAME = u'myvideo'
3140
3141 def __init__(self, downloader=None):
3142 InfoExtractor.__init__(self, downloader)
3143
3144 def report_download_webpage(self, video_id):
3145 """Report webpage download."""
3146 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3147
3148 def report_extraction(self, video_id):
3149 """Report information extraction."""
3150 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3151
3152 def _real_extract(self,url):
3153 mobj = re.match(self._VALID_URL, url)
3154 if mobj is None:
3155 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3156 return
3157
3158 video_id = mobj.group(1)
3159
3160 # Get video webpage
3161 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3162 try:
3163 self.report_download_webpage(video_id)
3164 webpage = urllib2.urlopen(request).read()
3165 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3166 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3167 return
3168
3169 self.report_extraction(video_id)
3170 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3171 webpage)
3172 if mobj is None:
3173 self._downloader.trouble(u'ERROR: unable to extract media URL')
3174 return
3175 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3176
3177 mobj = re.search('<title>([^<]+)</title>', webpage)
3178 if mobj is None:
3179 self._downloader.trouble(u'ERROR: unable to extract title')
3180 return
3181
3182 video_title = mobj.group(1)
3183 video_title = sanitize_title(video_title)
3184
3185 simple_title = _simplify_title(video_title)
3186
3187 try:
3188 self._downloader.process_info({
3189 'id': video_id,
3190 'url': video_url,
3191 'uploader': u'NA',
3192 'upload_date': u'NA',
3193 'title': video_title,
3194 'stitle': simple_title,
3195 'ext': u'flv',
3196 'format': u'NA',
3197 'player_url': None,
3198 })
3199 except UnavailableVideoError:
3200 self._downloader.trouble(u'\nERROR: Unable to download video')
3201
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Resolves a full-episodes page (or the :tds/:cr shortcuts) to the
    show's MRSS index, then downloads the media configuration of every
    item in it to pick a video URL.
    """

    # Shortcuts like ':tds' select the newest episode of a show; full
    # URLs carry the show name and (optionally) an episode slug.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Map the shortcut onto the show's full-episodes page and
            # re-match so the named groups get populated.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # Without an episode slug we rely on the site redirecting us to
        # the newest episode.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The final (post-redirect) URL names the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Flash player URLs embedded in the page carry the mtvnservices
        # URI (second group) that identifies the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # The guid is colon-separated; its last component is the media
            # id and the one before it the show's domain.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect every available (bitrate, url) rendition pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3336
3337
3338 class EscapistIE(InfoExtractor):
3339 """Information extractor for The Escapist """
3340
3341 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3342 IE_NAME = u'escapist'
3343
3344 def report_extraction(self, showName):
3345 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3346
3347 def report_config_download(self, showName):
3348 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3349
3350 def _real_extract(self, url):
3351 htmlParser = HTMLParser.HTMLParser()
3352
3353 mobj = re.match(self._VALID_URL, url)
3354 if mobj is None:
3355 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3356 return
3357 showName = mobj.group('showname')
3358 videoId = mobj.group('episode')
3359
3360 self.report_extraction(showName)
3361 try:
3362 webPage = urllib2.urlopen(url).read()
3363 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3364 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3365 return
3366
3367 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3368 description = htmlParser.unescape(descMatch.group(1))
3369 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3370 imgUrl = htmlParser.unescape(imgMatch.group(1))
3371 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3372 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3373 configUrlMatch = re.search('config=(.*)$', playerUrl)
3374 configUrl = urllib2.unquote(configUrlMatch.group(1))
3375
3376 self.report_config_download(showName)
3377 try:
3378 configJSON = urllib2.urlopen(configUrl).read()
3379 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3380 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3381 return
3382
3383 # Technically, it's JavaScript, not JSON
3384 configJSON = configJSON.replace("'", '"')
3385
3386 try:
3387 config = json.loads(configJSON)
3388 except (ValueError,), err:
3389 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3390 return
3391
3392 playlist = config['playlist']
3393 videoUrl = playlist[1]['url']
3394
3395 self._downloader.increment_downloads()
3396 info = {
3397 'id': videoId,
3398 'url': videoUrl,
3399 'uploader': showName,
3400 'upload_date': None,
3401 'title': showName,
3402 'stitle': _simplify_title(showName),
3403 'ext': 'flv',
3404 'format': 'flv',
3405 'thumbnail': imgUrl,
3406 'description': description,
3407 'player_url': playerUrl,
3408 }
3409
3410 try:
3411 self._downloader.process_info(info)
3412 except UnavailableVideoError, err:
3413 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3414
3415
3416 class CollegeHumorIE(InfoExtractor):
3417 """Information extractor for collegehumor.com"""
3418
3419 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3420 IE_NAME = u'collegehumor'
3421
3422 def report_webpage(self, video_id):
3423 """Report information extraction."""
3424 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3425
3426 def report_extraction(self, video_id):
3427 """Report information extraction."""
3428 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3429
3430 def _real_extract(self, url):
3431 htmlParser = HTMLParser.HTMLParser()
3432
3433 mobj = re.match(self._VALID_URL, url)
3434 if mobj is None:
3435 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3436 return
3437 video_id = mobj.group('videoid')
3438
3439 self.report_webpage(video_id)
3440 request = urllib2.Request(url)
3441 try:
3442 webpage = urllib2.urlopen(request).read()
3443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3444 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3445 return
3446
3447 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3448 if m is None:
3449 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3450 return
3451 internal_video_id = m.group('internalvideoid')
3452
3453 info = {
3454 'id': video_id,
3455 'internal_id': internal_video_id,
3456 }
3457
3458 self.report_extraction(video_id)
3459 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3460 try:
3461 metaXml = urllib2.urlopen(xmlUrl).read()
3462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3463 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3464 return
3465
3466 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3467 try:
3468 videoNode = mdoc.findall('./video')[0]
3469 info['description'] = videoNode.findall('./description')[0].text
3470 info['title'] = videoNode.findall('./caption')[0].text
3471 info['stitle'] = _simplify_title(info['title'])
3472 info['url'] = videoNode.findall('./file')[0].text
3473 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3474 info['ext'] = info['url'].rpartition('.')[2]
3475 info['format'] = info['ext']
3476 except IndexError:
3477 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3478 return
3479
3480 self._downloader.increment_downloads()
3481
3482 try:
3483 self._downloader.process_info(info)
3484 except UnavailableVideoError, err:
3485 self._downloader.trouble(u'\nERROR: unable to download video')
3486
3487
3488 class XVideosIE(InfoExtractor):
3489 """Information extractor for xvideos.com"""
3490
3491 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3492 IE_NAME = u'xvideos'
3493
3494 def report_webpage(self, video_id):
3495 """Report information extraction."""
3496 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3497
3498 def report_extraction(self, video_id):
3499 """Report information extraction."""
3500 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3501
3502 def _real_extract(self, url):
3503 htmlParser = HTMLParser.HTMLParser()
3504
3505 mobj = re.match(self._VALID_URL, url)
3506 if mobj is None:
3507 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3508 return
3509 video_id = mobj.group(1).decode('utf-8')
3510
3511 self.report_webpage(video_id)
3512
3513 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3514 try:
3515 webpage = urllib2.urlopen(request).read()
3516 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3517 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3518 return
3519
3520 self.report_extraction(video_id)
3521
3522
3523 # Extract video URL
3524 mobj = re.search(r'flv_url=(.+?)&', webpage)
3525 if mobj is None:
3526 self._downloader.trouble(u'ERROR: unable to extract video url')
3527 return
3528 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3529
3530
3531 # Extract title
3532 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3533 if mobj is None:
3534 self._downloader.trouble(u'ERROR: unable to extract video title')
3535 return
3536 video_title = mobj.group(1).decode('utf-8')
3537
3538
3539 # Extract video thumbnail
3540 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3541 if mobj is None:
3542 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3543 return
3544 video_thumbnail = mobj.group(1).decode('utf-8')
3545
3546
3547
3548 self._downloader.increment_downloads()
3549 info = {
3550 'id': video_id,
3551 'url': video_url,
3552 'uploader': None,
3553 'upload_date': None,
3554 'title': video_title,
3555 'stitle': _simplify_title(video_title),
3556 'ext': 'flv',
3557 'format': 'flv',
3558 'thumbnail': video_thumbnail,
3559 'description': None,
3560 'player_url': None,
3561 }
3562
3563 try:
3564 self._downloader.process_info(info)
3565 except UnavailableVideoError, err:
3566 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3567
3568
3569 class SoundcloudIE(InfoExtractor):
3570 """Information extractor for soundcloud.com
3571 To access the media, the uid of the song and a stream token
3572 must be extracted from the page source and the script must make
3573 a request to media.soundcloud.com/crossdomain.xml. Then
3574 the media can be grabbed by requesting from an url composed
3575 of the stream token and uid
3576 """
3577
3578 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3579 IE_NAME = u'soundcloud'
3580
3581 def __init__(self, downloader=None):
3582 InfoExtractor.__init__(self, downloader)
3583
3584 def report_webpage(self, video_id):
3585 """Report information extraction."""
3586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3587
3588 def report_extraction(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3591
3592 def _real_extract(self, url):
3593 htmlParser = HTMLParser.HTMLParser()
3594
3595 mobj = re.match(self._VALID_URL, url)
3596 if mobj is None:
3597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3598 return
3599
3600 # extract uploader (which is in the url)
3601 uploader = mobj.group(1).decode('utf-8')
3602 # extract simple title (uploader + slug of song title)
3603 slug_title = mobj.group(2).decode('utf-8')
3604 simple_title = uploader + '-' + slug_title
3605
3606 self.report_webpage('%s/%s' % (uploader, slug_title))
3607
3608 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3609 try:
3610 webpage = urllib2.urlopen(request).read()
3611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3612 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3613 return
3614
3615 self.report_extraction('%s/%s' % (uploader, slug_title))
3616
3617 # extract uid and stream token that soundcloud hands out for access
3618 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3619 if mobj:
3620 video_id = mobj.group(1)
3621 stream_token = mobj.group(2)
3622
3623 # extract unsimplified title
3624 mobj = re.search('"title":"(.*?)",', webpage)
3625 if mobj:
3626 title = mobj.group(1)
3627
3628 # construct media url (with uid/token)
3629 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3630 mediaURL = mediaURL % (video_id, stream_token)
3631
3632 # description
3633 description = u'No description available'
3634 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3635 if mobj:
3636 description = mobj.group(1)
3637
3638 # upload date
3639 upload_date = None
3640 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3641 if mobj:
3642 try:
3643 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3644 except Exception, e:
3645 print str(e)
3646
3647 # for soundcloud, a request to a cross domain is required for cookies
3648 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3649
3650 try:
3651 self._downloader.process_info({
3652 'id': video_id.decode('utf-8'),
3653 'url': mediaURL,
3654 'uploader': uploader.decode('utf-8'),
3655 'upload_date': upload_date,
3656 'title': simple_title.decode('utf-8'),
3657 'stitle': simple_title.decode('utf-8'),
3658 'ext': u'mp3',
3659 'format': u'NA',
3660 'player_url': None,
3661 'description': description.decode('utf-8')
3662 })
3663 except UnavailableVideoError:
3664 self._downloader.trouble(u'\nERROR: unable to download video')
3665
3666
3667 class InfoQIE(InfoExtractor):
3668 """Information extractor for infoq.com"""
3669
3670 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3671 IE_NAME = u'infoq'
3672
3673 def report_webpage(self, video_id):
3674 """Report information extraction."""
3675 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3676
3677 def report_extraction(self, video_id):
3678 """Report information extraction."""
3679 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3680
3681 def _real_extract(self, url):
3682 htmlParser = HTMLParser.HTMLParser()
3683
3684 mobj = re.match(self._VALID_URL, url)
3685 if mobj is None:
3686 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3687 return
3688
3689 self.report_webpage(url)
3690
3691 request = urllib2.Request(url)
3692 try:
3693 webpage = urllib2.urlopen(request).read()
3694 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3695 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3696 return
3697
3698 self.report_extraction(url)
3699
3700
3701 # Extract video URL
3702 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3703 if mobj is None:
3704 self._downloader.trouble(u'ERROR: unable to extract video url')
3705 return
3706 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3707
3708
3709 # Extract title
3710 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3711 if mobj is None:
3712 self._downloader.trouble(u'ERROR: unable to extract video title')
3713 return
3714 video_title = mobj.group(1).decode('utf-8')
3715
3716 # Extract description
3717 video_description = u'No description available.'
3718 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3719 if mobj is not None:
3720 video_description = mobj.group(1).decode('utf-8')
3721
3722 video_filename = video_url.split('/')[-1]
3723 video_id, extension = video_filename.split('.')
3724
3725 self._downloader.increment_downloads()
3726 info = {
3727 'id': video_id,
3728 'url': video_url,
3729 'uploader': None,
3730 'upload_date': None,
3731 'title': video_title,
3732 'stitle': _simplify_title(video_title),
3733 'ext': extension,
3734 'format': extension, # Extension is always(?) mp4, but seems to be flv
3735 'thumbnail': None,
3736 'description': video_description,
3737 'player_url': None,
3738 }
3739
3740 try:
3741 self._downloader.process_info(info)
3742 except UnavailableVideoError, err:
3743 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3744
3745 class MixcloudIE(InfoExtractor):
3746 """Information extractor for www.mixcloud.com"""
3747 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3748 IE_NAME = u'mixcloud'
3749
3750 def __init__(self, downloader=None):
3751 InfoExtractor.__init__(self, downloader)
3752
3753 def report_download_json(self, file_id):
3754 """Report JSON download."""
3755 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3756
3757 def report_extraction(self, file_id):
3758 """Report information extraction."""
3759 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3760
3761 def get_urls(self, jsonData, fmt, bitrate='best'):
3762 """Get urls from 'audio_formats' section in json"""
3763 file_url = None
3764 try:
3765 bitrate_list = jsonData[fmt]
3766 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3767 bitrate = max(bitrate_list) # select highest
3768
3769 url_list = jsonData[fmt][bitrate]
3770 except TypeError: # we have no bitrate info.
3771 url_list = jsonData[fmt]
3772
3773 return url_list
3774
3775 def check_urls(self, url_list):
3776 """Returns 1st active url from list"""
3777 for url in url_list:
3778 try:
3779 urllib2.urlopen(url)
3780 return url
3781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3782 url = None
3783
3784 return None
3785
3786 def _print_formats(self, formats):
3787 print 'Available formats:'
3788 for fmt in formats.keys():
3789 for b in formats[fmt]:
3790 try:
3791 ext = formats[fmt][b][0]
3792 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3793 except TypeError: # we have no bitrate info
3794 ext = formats[fmt][0]
3795 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3796 break
3797
3798 def _real_extract(self, url):
3799 mobj = re.match(self._VALID_URL, url)
3800 if mobj is None:
3801 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3802 return
3803 # extract uploader & filename from url
3804 uploader = mobj.group(1).decode('utf-8')
3805 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3806
3807 # construct API request
3808 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3809 # retrieve .json file with links to files
3810 request = urllib2.Request(file_url)
3811 try:
3812 self.report_download_json(file_url)
3813 jsonData = urllib2.urlopen(request).read()
3814 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3815 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3816 return
3817
3818 # parse JSON
3819 json_data = json.loads(jsonData)
3820 player_url = json_data['player_swf_url']
3821 formats = dict(json_data['audio_formats'])
3822
3823 req_format = self._downloader.params.get('format', None)
3824 bitrate = None
3825
3826 if self._downloader.params.get('listformats', None):
3827 self._print_formats(formats)
3828 return
3829
3830 if req_format is None or req_format == 'best':
3831 for format_param in formats.keys():
3832 url_list = self.get_urls(formats, format_param)
3833 # check urls
3834 file_url = self.check_urls(url_list)
3835 if file_url is not None:
3836 break # got it!
3837 else:
3838 if req_format not in formats.keys():
3839 self._downloader.trouble(u'ERROR: format is not available')
3840 return
3841
3842 url_list = self.get_urls(formats, req_format)
3843 file_url = self.check_urls(url_list)
3844 format_param = req_format
3845
3846 # We have audio
3847 self._downloader.increment_downloads()
3848 try:
3849 # Process file information
3850 self._downloader.process_info({
3851 'id': file_id.decode('utf-8'),
3852 'url': file_url.decode('utf-8'),
3853 'uploader': uploader.decode('utf-8'),
3854 'upload_date': u'NA',
3855 'title': json_data['name'],
3856 'stitle': _simplify_title(json_data['name']),
3857 'ext': file_url.split('.')[-1].decode('utf-8'),
3858 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3859 'thumbnail': json_data['thumbnail_url'],
3860 'description': json_data['description'],
3861 'player_url': player_url.decode('utf-8'),
3862 })
3863 except UnavailableVideoError, err:
3864 self._downloader.trouble(u'ERROR: unable to download file')
3865
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# The path group distinguishes the three page kinds; course/video
	# capture the query parameters when present.
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL type: a single video, a course page, or the
		root page. Course and root pages are treated as playlists whose
		entries are re-fed through self.extract()."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the videos
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# findall()[0] raises IndexError when the element is missing
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Extension is whatever follows the last dot of the video URL
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Fall back to the id when the page carries no <h1> title
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect each lecture link once, preserving page order
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			# Recurse into each video page through the normal dispatcher
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Collect each course link once, preserving page order
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			# Recurse into each course page through the normal dispatcher
			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3984
3985 class MTVIE(InfoExtractor):
3986 """Information extractor for MTV.com"""
3987
3988 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3989 IE_NAME = u'mtv'
3990
3991 def report_webpage(self, video_id):
3992 """Report information extraction."""
3993 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3994
3995 def report_extraction(self, video_id):
3996 """Report information extraction."""
3997 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3998
3999 def _real_extract(self, url):
4000 mobj = re.match(self._VALID_URL, url)
4001 if mobj is None:
4002 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4003 return
4004 if not mobj.group('proto'):
4005 url = 'http://' + url
4006 video_id = mobj.group('videoid')
4007 self.report_webpage(video_id)
4008
4009 request = urllib2.Request(url)
4010 try:
4011 webpage = urllib2.urlopen(request).read()
4012 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4013 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4014 return
4015
4016 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4017 if mobj is None:
4018 self._downloader.trouble(u'ERROR: unable to extract song name')
4019 return
4020 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4021 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4022 if mobj is None:
4023 self._downloader.trouble(u'ERROR: unable to extract performer')
4024 return
4025 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4026 video_title = performer + ' - ' + song_name
4027
4028 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4029 if mobj is None:
4030 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4031 return
4032 mtvn_uri = mobj.group(1)
4033
4034 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4035 if mobj is None:
4036 self._downloader.trouble(u'ERROR: unable to extract content id')
4037 return
4038 content_id = mobj.group(1)
4039
4040 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4041 self.report_extraction(video_id)
4042 request = urllib2.Request(videogen_url)
4043 try:
4044 metadataXml = urllib2.urlopen(request).read()
4045 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4046 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4047 return
4048
4049 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4050 renditions = mdoc.findall('.//rendition')
4051
4052 # For now, always pick the highest quality.
4053 rendition = renditions[-1]
4054
4055 try:
4056 _,_,ext = rendition.attrib['type'].partition('/')
4057 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4058 video_url = rendition.find('./src').text
4059 except KeyError:
4060 self._downloader.trouble('Invalid rendition field.')
4061 return
4062
4063 self._downloader.increment_downloads()
4064 info = {
4065 'id': video_id,
4066 'url': video_url,
4067 'uploader': performer,
4068 'title': video_title,
4069 'stitle': _simplify_title(video_title),
4070 'ext': ext,
4071 'format': format,
4072 }
4073
4074 try:
4075 self._downloader.process_info(info)
4076 except UnavailableVideoError, err:
4077 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4078
4079
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After a successful download the
	downloader invokes run() on each registered PostProcessor in turn,
	feeding each call the dictionary returned by the previous one; the
	chain stops as soon as a run() returns None or no processors remain.

	Like InfoExtractor objects, PostProcessors and their downloader
	reference each other ("mutual registration").
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run this processing step.

		The "information" argument is an InfoExtractor-style dictionary
		augmented with a "filepath" key naming the downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the input with some fields changed) passes
		it on to the next processor. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		# Base implementation: pass the data through untouched.
		return information
4125
class AudioConversionError(BaseException):
	# Raised by FFmpegExtractAudioPP when ffmpeg/ffprobe fails.
	# NOTE(review): subclasses BaseException rather than Exception, so a
	# plain 'except Exception' will NOT catch it — callers must name it.
	def __init__(self, message):
		# Human-readable failure reason; read via e.message by callers.
		self.message = message
4129
class FFmpegExtractAudioPP(PostProcessor):
	# Post-processor that extracts the audio track of a downloaded video
	# with ffmpeg, copying the stream losslessly when possible and
	# transcoding otherwise.

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# Target codec name ('mp3', 'aac', 'm4a', 'vorbis', 'wav') or 'best'
		self._preferredcodec = preferredcodec
		# Bitrate handed to ffmpeg's -ab option when set
		self._preferredquality = preferredquality
		# When False, the source video file is deleted after extraction
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		# Probe the file with ffprobe and return the codec_name of its
		# audio stream, or None if it cannot be determined.
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name before codec_type for each stream, so
		# remember the last codec_name seen and return it once an audio
		# stream is confirmed.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		# Invoke ffmpeg to write out_path with the given audio codec
		# (None means let ffmpeg choose). Raises AudioConversionError on
		# failure, including when ffmpeg is not installed (errno 2).
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# Last stderr line is ffmpeg's actual error message
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# Deliberately broad: report AudioConversionError specially,
			# treat anything else as a generic ffmpeg failure.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4254
4255
4256 def updateSelf(downloader, filename):
4257 ''' Update the program file with the latest version from the repository '''
4258 # Note: downloader only used for options
4259 if not os.access(filename, os.W_OK):
4260 sys.exit('ERROR: no write permissions on %s' % filename)
4261
4262 downloader.to_screen(u'Updating to latest version...')
4263
4264 try:
4265 try:
4266 urlh = urllib.urlopen(UPDATE_URL)
4267 newcontent = urlh.read()
4268
4269 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4270 if vmatch is not None and vmatch.group(1) == __version__:
4271 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4272 return
4273 finally:
4274 urlh.close()
4275 except (IOError, OSError), err:
4276 sys.exit('ERROR: unable to download latest version')
4277
4278 try:
4279 outf = open(filename, 'wb')
4280 try:
4281 outf.write(newcontent)
4282 finally:
4283 outf.close()
4284 except (IOError, OSError), err:
4285 sys.exit('ERROR: unable to overwrite current version')
4286
4287 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4288
4289 def parseOpts():
4290 def _readOptions(filename_bytes):
4291 try:
4292 optionf = open(filename_bytes)
4293 except IOError:
4294 return [] # silently skip if file is not present
4295 try:
4296 res = []
4297 for l in optionf:
4298 res += shlex.split(l, comments=True)
4299 finally:
4300 optionf.close()
4301 return res
4302
4303 def _format_option_string(option):
4304 ''' ('-o', '--option') -> -o, --format METAVAR'''
4305
4306 opts = []
4307
4308 if option._short_opts: opts.append(option._short_opts[0])
4309 if option._long_opts: opts.append(option._long_opts[0])
4310 if len(opts) > 1: opts.insert(1, ', ')
4311
4312 if option.takes_value(): opts.append(' %s' % option.metavar)
4313
4314 return "".join(opts)
4315
4316 def _find_term_columns():
4317 columns = os.environ.get('COLUMNS', None)
4318 if columns:
4319 return int(columns)
4320
4321 try:
4322 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4323 out,err = sp.communicate()
4324 return int(out.split()[1])
4325 except:
4326 pass
4327 return None
4328
4329 max_width = 80
4330 max_help_position = 80
4331
4332 # No need to wrap help messages if we're on a wide console
4333 columns = _find_term_columns()
4334 if columns: max_width = columns
4335
4336 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4337 fmt.format_option_strings = _format_option_string
4338
4339 kw = {
4340 'version' : __version__,
4341 'formatter' : fmt,
4342 'usage' : '%prog [options] url [url...]',
4343 'conflict_handler' : 'resolve',
4344 }
4345
4346 parser = optparse.OptionParser(**kw)
4347
4348 # option groups
4349 general = optparse.OptionGroup(parser, 'General Options')
4350 selection = optparse.OptionGroup(parser, 'Video Selection')
4351 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4352 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4353 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4354 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4355 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4356
4357 general.add_option('-h', '--help',
4358 action='help', help='print this help text and exit')
4359 general.add_option('-v', '--version',
4360 action='version', help='print program version and exit')
4361 general.add_option('-U', '--update',
4362 action='store_true', dest='update_self', help='update this program to latest version')
4363 general.add_option('-i', '--ignore-errors',
4364 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4365 general.add_option('-r', '--rate-limit',
4366 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4367 general.add_option('-R', '--retries',
4368 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4369 general.add_option('--dump-user-agent',
4370 action='store_true', dest='dump_user_agent',
4371 help='display the current browser identification', default=False)
4372 general.add_option('--list-extractors',
4373 action='store_true', dest='list_extractors',
4374 help='List all supported extractors and the URLs they would handle', default=False)
4375
4376 selection.add_option('--playlist-start',
4377 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4378 selection.add_option('--playlist-end',
4379 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4380 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4381 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4382 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4383
4384 authentication.add_option('-u', '--username',
4385 dest='username', metavar='USERNAME', help='account username')
4386 authentication.add_option('-p', '--password',
4387 dest='password', metavar='PASSWORD', help='account password')
4388 authentication.add_option('-n', '--netrc',
4389 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4390
4391
4392 video_format.add_option('-f', '--format',
4393 action='store', dest='format', metavar='FORMAT', help='video format code')
4394 video_format.add_option('--all-formats',
4395 action='store_const', dest='format', help='download all available video formats', const='all')
4396 video_format.add_option('--prefer-free-formats',
4397 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4398 video_format.add_option('--max-quality',
4399 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4400 video_format.add_option('-F', '--list-formats',
4401 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4402 video_format.add_option('--write-srt',
4403 action='store_true', dest='writesubtitles',
4404 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4405 video_format.add_option('--srt-lang',
4406 action='store', dest='subtitleslang', metavar='LANG',
4407 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4408
4409
4410 verbosity.add_option('-q', '--quiet',
4411 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4412 verbosity.add_option('-s', '--simulate',
4413 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4414 verbosity.add_option('--skip-download',
4415 action='store_true', dest='skip_download', help='do not download the video', default=False)
4416 verbosity.add_option('-g', '--get-url',
4417 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4418 verbosity.add_option('-e', '--get-title',
4419 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4420 verbosity.add_option('--get-thumbnail',
4421 action='store_true', dest='getthumbnail',
4422 help='simulate, quiet but print thumbnail URL', default=False)
4423 verbosity.add_option('--get-description',
4424 action='store_true', dest='getdescription',
4425 help='simulate, quiet but print video description', default=False)
4426 verbosity.add_option('--get-filename',
4427 action='store_true', dest='getfilename',
4428 help='simulate, quiet but print output filename', default=False)
4429 verbosity.add_option('--get-format',
4430 action='store_true', dest='getformat',
4431 help='simulate, quiet but print output format', default=False)
4432 verbosity.add_option('--no-progress',
4433 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4434 verbosity.add_option('--console-title',
4435 action='store_true', dest='consoletitle',
4436 help='display progress in console titlebar', default=False)
4437 verbosity.add_option('-v', '--verbose',
4438 action='store_true', dest='verbose', help='print various debugging information', default=False)
4439
4440
4441 filesystem.add_option('-t', '--title',
4442 action='store_true', dest='usetitle', help='use title in file name', default=False)
4443 filesystem.add_option('-l', '--literal',
4444 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4445 filesystem.add_option('-A', '--auto-number',
4446 action='store_true', dest='autonumber',
4447 help='number downloaded files starting from 00000', default=False)
4448 filesystem.add_option('-o', '--output',
4449 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4450 filesystem.add_option('-a', '--batch-file',
4451 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4452 filesystem.add_option('-w', '--no-overwrites',
4453 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4454 filesystem.add_option('-c', '--continue',
4455 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4456 filesystem.add_option('--no-continue',
4457 action='store_false', dest='continue_dl',
4458 help='do not resume partially downloaded files (restart from beginning)')
4459 filesystem.add_option('--cookies',
4460 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4461 filesystem.add_option('--no-part',
4462 action='store_true', dest='nopart', help='do not use .part files', default=False)
4463 filesystem.add_option('--no-mtime',
4464 action='store_false', dest='updatetime',
4465 help='do not use the Last-modified header to set the file modification time', default=True)
4466 filesystem.add_option('--write-description',
4467 action='store_true', dest='writedescription',
4468 help='write video description to a .description file', default=False)
4469 filesystem.add_option('--write-info-json',
4470 action='store_true', dest='writeinfojson',
4471 help='write video metadata to a .info.json file', default=False)
4472
4473
4474 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4475 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4476 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4477 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4478 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4479 help='ffmpeg audio bitrate specification, 128k by default')
4480 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4481 help='keeps the video file on disk after the post-processing; the video is erased by default')
4482
4483
4484 parser.add_option_group(general)
4485 parser.add_option_group(selection)
4486 parser.add_option_group(filesystem)
4487 parser.add_option_group(verbosity)
4488 parser.add_option_group(video_format)
4489 parser.add_option_group(authentication)
4490 parser.add_option_group(postproc)
4491
4492 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4493 if xdg_config_home:
4494 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4495 else:
4496 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4497 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4498 opts, args = parser.parse_args(argv)
4499
4500 return parser, opts, args
4501
def gen_extractors():
	"""Build the ordered list of information extractor instances.

	Order is significant: a URL is handled by the first extractor whose
	suitable() check accepts it, which is why the catch-all GenericIE
	must remain the final entry.
	"""
	# A few extractors are shared: the search/playlist/user variants
	# delegate actual video extraction to the plain extractor instance.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# Generic fallback goes last so every specific extractor gets first shot.
	extractors.append(GenericIE())
	return extractors
4538
def _real_main():
	"""Parse options, configure the FileDownloader and process all URLs.

	Never returns normally: every code path ends in sys.exit(), using the
	downloader's return code (or 101 when --max-downloads is reached).
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load an existing, readable file; otherwise start with an
			# empty jar and let jar.save() create the file at the end.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and lines starting with '#', '/' or ';' so
			# batch files may contain comments.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args
	# Python 2 map() returns a list, so all_urls remains a list here.
	all_urls = map(lambda url: url.strip(), all_urls)

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	# Installed as the process-wide default opener, so every urllib2 call
	# made by the extractors gets cookie, proxy and gzip/deflate handling.
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		for ie in extractors:
			print(ie.IE_NAME)
			# List each given URL under the first extractor that claims it;
			# matched URLs are removed so none appears twice.
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u' ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively rather than failing when only -u was given.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel meaning "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any "print X and stop" option implies quiet operation.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First truthy template wins: explicit -o, then the '-f -1'
		# variants, then title/literal/autonumber combinations, and
		# finally the bare id-based fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout ('-o -') forces messages to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			# --update-self alone is a valid invocation; exit cleanly.
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4713
def main():
	"""Run _real_main() and translate known failures into exit statuses."""
	try:
		_real_main()
	except DownloadError:
		status = 1
	except SameFileError:
		status = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		status = u'\nERROR: Interrupted by user'
	else:
		# _real_main() normally exits on its own; nothing left to do.
		return
	sys.exit(status)
4723
# Script entry point: delegate to main(), which maps errors to exit codes.
if __name__ == '__main__':
	main()

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: