# yt-dlp.git history / youtube-dl
# Commit: Extract original URL from next_url parameter of verify_age page, before actual extract
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors, shown in project documentation and --help output.
__authors__  = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Date-based version string (YYYY.MM.DD of the release).
__version__ = '2012.02.27'

# Canonical location of the newest released script; used by --update.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52 import ctypes
53
54 try:
55 import email.utils
56 except ImportError: # Python 2.4
57 import email.Utils
58 try:
59 import cStringIO as StringIO
60 except ImportError:
61 import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65 from urlparse import parse_qs
66 except ImportError:
67 from cgi import parse_qs
68
69 try:
70 import lxml.etree
71 except ImportError:
72 pass # Handled below
73
74 try:
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default headers added to every HTTP request; mimics a desktop Firefox
# so that sites serve the same pages they serve to a regular browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
86
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (loads() only).

        Implemented as a hand-written recursive-descent parser over the
        decoded unicode string; each parse* helper returns a tuple
        (next_index, parsed_value).
        """
        @staticmethod
        def loads(s):
            # Input is expected to be UTF-8 encoded bytes.
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; with expectMore, running off the
                # end of the input is an error (a value must still follow).
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (including \uXXXX and
                # UTF-16 surrogate pairs) into the character it denotes.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine high and low halves
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over escaped quotes
                # (a quote preceded by an odd number of backslashes).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The three literal values: true, false, null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fractional or exponent part makes it a float
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first significant character; numbers are the fallback.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original wrapped this in an infinite generator and called
    # .next() on it once — a needless indirection (and Python-3-hostile).
    # A plain try/except returns the exact same value.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'
    return pref
215
216
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
219
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
222 """
223 entity = matchobj.group(1)
224
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
228
229 # Unicode character
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
231 if mobj is not None:
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
234 base = 16
235 numstr = u'0%s' % numstr
236 else:
237 base = 10
238 return unichr(long(numstr, base))
239
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
242
243
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
248
249
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows stdout must be switched to
            # binary mode or video data would be mangled by CRLF translation
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
284
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
def _unescapeHTML(s):
    """Replace HTML entities in s by the characters they name.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    # A throwaway parser instance is enough; unescape() is stateless here
    return HTMLParser.HTMLParser().unescape(s)
305
306 def _encodeFilename(s):
307 """
308 @param s The name of the file (of type unicode)
309 """
310
311 assert type(s) == type(u'')
312
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317 return s
318 else:
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
329
330
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
338
339
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
347
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
351
352
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
360
361
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file turns out to
    be smaller than what the server announced first, which indicates the
    connection was probably interrupted.
    """
    # Byte counters (class-level defaults; set per instance below)
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
376
377
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Raw deflate streams lack the zlib header; try raw first, then
        # fall back to a full zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only gained a 'code' argument in newer Pythons;
        # emulate it on older versions by setting the attribute manually.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers, replacing any caller-supplied copies
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # The marker header opts this request out of compression
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
435
436
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    writesubtitles:   Write the video subtitles to a .srt file
    subtitleslang:    Language of the subtitles to download
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen output to stderr if requested, keeping stdout clean
        # for the forced printings (--get-url, --get-title, ...)
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params
512
    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable string (e.g. '1.21M') for a byte count."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Largest power of 1024 that keeps the converted value >= 1
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)
526
527 @staticmethod
528 def calc_percent(byte_counter, data_len):
529 if data_len is None:
530 return '---.-%'
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532
533 @staticmethod
534 def calc_eta(start, now, total, current):
535 if total is None:
536 return '--:--'
537 dif = now - start
538 if current == 0 or dif < 0.001: # One millisecond
539 return '--:--'
540 rate = float(current) / dif
541 eta = long((float(total) - float(current)) / rate)
542 (eta_mins, eta_secs) = divmod(eta, 60)
543 if eta_mins > 99:
544 return '--:--'
545 return '%02d:%02d' % (eta_mins, eta_secs)
546
547 @staticmethod
548 def calc_speed(start, now, bytes):
549 dif = now - start
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553
554 @staticmethod
555 def best_block_size(elapsed_time, bytes):
556 new_min = max(bytes / 2.0, 1.0)
557 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
558 if elapsed_time < 0.001:
559 return long(new_max)
560 rate = bytes / elapsed_time
561 if rate > new_max:
562 return long(new_max)
563 if rate < new_min:
564 return long(new_min)
565 return long(rate)
566
567 @staticmethod
568 def parse_bytes(bytestr):
569 """Parse a string indicating a byte quantity into a long integer."""
570 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
571 if matchobj is None:
572 return None
573 number = float(matchobj.group(1))
574 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
575 return long(round(number * multiplier))
576
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE gets a reference back to its downloader
        ie.set_downloader(self)
581
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        # Mutual registration, same as for InfoExtractors
        pp.set_downloader(self)
586
    def to_screen(self, message, skip_eol=False):
        """Print message to stdout (or stderr, per logtostderr) unless quiet."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            # Flush immediately so progress updates appear in real time
            self._screen_file.flush()
598
    def to_stderr(self, message):
        """Print message to stderr."""
        # Encode explicitly: stderr may not accept unicode on this locale
        print >>sys.stderr, message.encode(preferredencoding())
602
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-compatible escape sequence: OSC 0 sets the window title
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
613
    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
617
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # With ignoreerrors set, just remember the failure for the exit code
        self._download_retcode = 1
630
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed since start_time
            # to fall back to the configured limit
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
643
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
648 return filename
649 return filename + u'.part'
650
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
654 return filename
655
    def try_rename(self, old_filename, new_filename):
        """Rename the temporary file to its final name, reporting failures."""
        try:
            if old_filename == new_filename:
                return
            os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
663
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
667 return
668 if not os.path.isfile(_encodeFilename(filename)):
669 return
670 timestr = last_modified_hdr
671 if timestr is None:
672 return
673 filetime = timeconvert(timestr)
674 if filetime is None:
675 return filetime
676 try:
677 os.utime(filename, (time.time(), filetime))
678 except:
679 pass
680 return filetime
681
    def report_writedescription(self, descfn):
        """ Report that the .description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)
685
    def report_writesubtitles(self, srtfn):
        """ Report that the .srt subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
689
    def report_writeinfojson(self, infofn):
        """ Report that the .info.json metadata file is being written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
693
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)
697
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress on one redrawn line (and console title)."""
        if self.params.get('noprogress', False):
            return
        # '\r' + skip_eol redraws the same line on each update
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
706
    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
710
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
714
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message when the name cannot be encoded
            self.to_screen(u'[download] The file has already been downloaded')
721
    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')
725
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is already on screen; just terminate it
            self.to_screen(u'')
732
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        # Also feeds the %(autonumber)s output-template field
        self._num_downloads += 1
736
    def prepare_filename(self, info_dict):
        """Generate the output filename, or None on template errors."""
        try:
            template_dict = dict(info_dict)
            # Extra template fields not supplied by the InfoExtractor
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
748
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
751
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
759 return None
760
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
763
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
767 return
768
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
773
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
789
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
792 return
793
794 if filename is None:
795 return
796
797 try:
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
800 os.makedirs(dn)
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
803 return
804
805 if self.params.get('writedescription', False):
806 try:
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
810 try:
811 descfile.write(info_dict['description'].encode('utf-8'))
812 finally:
813 descfile.close()
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
816 return
817
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
821 try:
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
825 try:
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
827 finally:
828 srtfile.close()
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
831 return
832
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
836 try:
837 json.dump
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
840 return
841 try:
842 infof = open(_encodeFilename(infofn), 'wb')
843 try:
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
846 finally:
847 infof.close()
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
850 return
851
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
854 success = True
855 else:
856 try:
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
862 return
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
865 return
866
867 if success:
868 try:
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
872 return
873
    def download(self, url_list):
        """Download a given list of URLs.

        Returns the accumulated return code (0 unless trouble() was
        triggered with ignoreerrors set).
        """
        # A fixed output template can only ever hold one download
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
899
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
902 info = dict(ie_info)
903 info['filepath'] = filename
904 for pp in self._pps:
905 info = pp.run(info)
906 if info is None:
907 break
908
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            try:
                import pipes
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        # Keep re-invoking rtmpdump in resume mode while it reports a
        # resumable interruption; stop when the file stops growing.
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
953
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
957
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
961 return True
962
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
966
967 tmpfilename = self.temp_name(filename)
968 stream = None
969
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
974
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
978 else:
979 resume_len = 0
980
981 open_mode = 'wb'
982 if resume_len != 0:
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
986 open_mode = 'ab'
987 else:
988 resume_len = 0
989
990 count = 0
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
994 try:
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
998 break
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1002 raise
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1005 try:
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1011 raise
1012 else:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1025 return True
1026 else:
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1029 open_mode = 'wb'
1030 break
1031 # Retry
1032 count += 1
1033 if count <= retries:
1034 self.report_retry(count, retries)
1035
1036 if count > retries:
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1038 return False
1039
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1045 block_size = 1024
1046 start = time.time()
1047 while True:
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1051 after = time.time()
1052 if len(data_block) == 0:
1053 break
1054 byte_counter += len(data_block)
1055
1056 # Open file just in time
1057 if stream is None:
1058 try:
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1065 return False
1066 try:
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1070 return False
1071 block_size = self.best_block_size(after - before, len(data_block))
1072
1073 # Progress message
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1077 else:
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1081
1082 # Apply rate limit
1083 self.slow_down(start, byte_counter - resume_len)
1084
1085 if stream is None:
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1087 return False
1088 stream.close()
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1093
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1097
1098 return True
1099
1100
class InfoExtractor(object):
	"""Base class for all information extractors.

	Given a URL, an information extractor pulls out the data describing
	the video (or videos) that URL refers to: the real video URL, the
	literal and simplified titles, the uploader and so on. That data is
	collected in a dictionary which is then handed to the FileDownloader,
	which may download the video to the file system, among other possible
	outcomes. Every dictionary must carry these fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses are expected to redefine _real_initialize() and
	_real_extract() and to define a _VALID_URL regexp, and should
	probably be added to the list of extractors as well.
	"""

	# Whether _real_initialize() has already run for this instance
	_ready = False
	# The FileDownloader this extractor reports to (may be None)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Initialize the instance once (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE should use."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1169
1170
1171 class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
1173
1174 _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)'
1175 _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$'
1177 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1178 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1179 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1180 _NETRC_MACHINE = 'youtube'
1181 # Listed in order of quality
1182 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1183 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1184 _video_extensions = {
1185 '13': '3gp',
1186 '17': 'mp4',
1187 '18': 'mp4',
1188 '22': 'mp4',
1189 '37': 'mp4',
1190 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1191 '43': 'webm',
1192 '44': 'webm',
1193 '45': 'webm',
1194 }
1195 _video_dimensions = {
1196 '5': '240x400',
1197 '6': '???',
1198 '13': '???',
1199 '17': '144x176',
1200 '18': '360x640',
1201 '22': '720x1280',
1202 '34': '360x640',
1203 '35': '480x854',
1204 '37': '1080x1920',
1205 '38': '3072x4096',
1206 '43': '360x640',
1207 '44': '480x854',
1208 '45': '720x1280',
1209 }
1210 IE_NAME = u'youtube'
1211
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1215
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1219
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1223
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231
1232 def report_video_subtitles_download(self, video_id):
1233 """Report attempt to download video info webpage."""
1234 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239
1240 def report_unavailable_format(self, video_id, format):
1241 """Report extracted video URL."""
1242 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1247
1248 def _closed_captions_xml_to_srt(self, xml_string):
1249 srt = ''
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
1263 return srt
1264
1265 def _print_formats(self, formats):
1266 print 'Available formats:'
1267 for x in formats:
1268 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1269
1270 def _real_initialize(self):
1271 if self._downloader is None:
1272 return
1273
1274 username = None
1275 password = None
1276 downloader_params = self._downloader.params
1277
1278 # Attempt to use provided username and password or .netrc data
1279 if downloader_params.get('username', None) is not None:
1280 username = downloader_params['username']
1281 password = downloader_params['password']
1282 elif downloader_params.get('usenetrc', False):
1283 try:
1284 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1285 if info is not None:
1286 username = info[0]
1287 password = info[2]
1288 else:
1289 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1290 except (IOError, netrc.NetrcParseError), err:
1291 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1292 return
1293
1294 # Set language
1295 request = urllib2.Request(self._LANG_URL)
1296 try:
1297 self.report_lang()
1298 urllib2.urlopen(request).read()
1299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1301 return
1302
1303 # No authentication to be performed
1304 if username is None:
1305 return
1306
1307 # Log in
1308 login_form = {
1309 'current_form': 'loginForm',
1310 'next': '/',
1311 'action_login': 'Log In',
1312 'username': username,
1313 'password': password,
1314 }
1315 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1316 try:
1317 self.report_login()
1318 login_results = urllib2.urlopen(request).read()
1319 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 return
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1324 return
1325
1326 # Confirm age
1327 age_form = {
1328 'next_url': '/',
1329 'action_confirm': 'Confirm',
1330 }
1331 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 try:
1333 self.report_age_confirmation()
1334 age_results = urllib2.urlopen(request).read()
1335 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1336 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1337 return
1338
1339 def _real_extract(self, url):
1340 # Extract original video URL from URL with age verification, using next_url parameter
1341 mobj = re.match(self._VALID_URL_WITH_AGE, url)
1342 if mobj:
1343 urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x)
1344 # Keep original domain. We can probably change to www.youtube.com, but it should not hurt so keep it.
1345 # We just make sure we do not have double //, in URL, so we strip starting slash in next_url.
1346 url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2)))
1347
1348 # Extract video id from URL
1349 mobj = re.match(self._VALID_URL, url)
1350 if mobj is None:
1351 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1352 return
1353 video_id = mobj.group(2)
1354
1355 # Get video webpage
1356 self.report_video_webpage_download(video_id)
1357 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1358 try:
1359 video_webpage = urllib2.urlopen(request).read()
1360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1362 return
1363
1364 # Attempt to extract SWF player URL
1365 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1366 if mobj is not None:
1367 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1368 else:
1369 player_url = None
1370
1371 # Get video info
1372 self.report_video_info_webpage_download(video_id)
1373 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1374 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1375 % (video_id, el_type))
1376 request = urllib2.Request(video_info_url)
1377 try:
1378 video_info_webpage = urllib2.urlopen(request).read()
1379 video_info = parse_qs(video_info_webpage)
1380 if 'token' in video_info:
1381 break
1382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1384 return
1385 if 'token' not in video_info:
1386 if 'reason' in video_info:
1387 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1388 else:
1389 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1390 return
1391
1392 # Start extracting information
1393 self.report_information_extraction(video_id)
1394
1395 # uploader
1396 if 'author' not in video_info:
1397 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1398 return
1399 video_uploader = urllib.unquote_plus(video_info['author'][0])
1400
1401 # title
1402 if 'title' not in video_info:
1403 self._downloader.trouble(u'ERROR: unable to extract video title')
1404 return
1405 video_title = urllib.unquote_plus(video_info['title'][0])
1406 video_title = video_title.decode('utf-8')
1407 video_title = sanitize_title(video_title)
1408
1409 # simplified title
1410 simple_title = _simplify_title(video_title)
1411
1412 # thumbnail image
1413 if 'thumbnail_url' not in video_info:
1414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1415 video_thumbnail = ''
1416 else: # don't panic if we can't find it
1417 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1418
1419 # upload date
1420 upload_date = u'NA'
1421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1422 if mobj is not None:
1423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1425 for expression in format_expressions:
1426 try:
1427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1428 except:
1429 pass
1430
1431 # description
1432 try:
1433 lxml.etree
1434 except NameError:
1435 video_description = u'No description available.'
1436 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1437 if mobj is not None:
1438 video_description = mobj.group(1).decode('utf-8')
1439 else:
1440 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1441 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1442 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1443 # TODO use another parser
1444
1445 # closed captions
1446 video_subtitles = None
1447 if self._downloader.params.get('writesubtitles', False):
1448 self.report_video_subtitles_download(video_id)
1449 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1450 try:
1451 srt_list = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1454 else:
1455 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1456 if srt_lang_list:
1457 if self._downloader.params.get('subtitleslang', False):
1458 srt_lang = self._downloader.params.get('subtitleslang')
1459 elif 'en' in srt_lang_list:
1460 srt_lang = 'en'
1461 else:
1462 srt_lang = srt_lang_list[0]
1463 if not srt_lang in srt_lang_list:
1464 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1465 else:
1466 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1467 try:
1468 srt_xml = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1471 else:
1472 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1473 else:
1474 self._downloader.trouble(u'WARNING: video has no closed captions')
1475
1476 # token
1477 video_token = urllib.unquote_plus(video_info['token'][0])
1478
1479 # Decide which formats to download
1480 req_format = self._downloader.params.get('format', None)
1481
1482 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1483 self.report_rtmp_download()
1484 video_url_list = [(None, video_info['conn'][0])]
1485 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1486 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1487 url_data = [parse_qs(uds) for uds in url_data_strs]
1488 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1489 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1490
1491 format_limit = self._downloader.params.get('format_limit', None)
1492 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1493 if format_limit is not None and format_limit in available_formats:
1494 format_list = available_formats[available_formats.index(format_limit):]
1495 else:
1496 format_list = available_formats
1497 existing_formats = [x for x in format_list if x in url_map]
1498 if len(existing_formats) == 0:
1499 self._downloader.trouble(u'ERROR: no known formats available for video')
1500 return
1501 if self._downloader.params.get('listformats', None):
1502 self._print_formats(existing_formats)
1503 return
1504 if req_format is None or req_format == 'best':
1505 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1506 elif req_format == 'worst':
1507 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1508 elif req_format in ('-1', 'all'):
1509 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1510 else:
1511 # Specific formats. We pick the first in a slash-delimeted sequence.
1512 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1513 req_formats = req_format.split('/')
1514 video_url_list = None
1515 for rf in req_formats:
1516 if rf in url_map:
1517 video_url_list = [(rf, url_map[rf])]
1518 break
1519 if video_url_list is None:
1520 self._downloader.trouble(u'ERROR: requested format not available')
1521 return
1522 else:
1523 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1524 return
1525
1526 for format_param, video_real_url in video_url_list:
1527 # At this point we have a new video
1528 self._downloader.increment_downloads()
1529
1530 # Extension
1531 video_extension = self._video_extensions.get(format_param, 'flv')
1532
1533 try:
1534 # Process video information
1535 self._downloader.process_info({
1536 'id': video_id.decode('utf-8'),
1537 'url': video_real_url.decode('utf-8'),
1538 'uploader': video_uploader.decode('utf-8'),
1539 'upload_date': upload_date,
1540 'title': video_title,
1541 'stitle': simple_title,
1542 'ext': video_extension.decode('utf-8'),
1543 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1544 'thumbnail': video_thumbnail.decode('utf-8'),
1545 'description': video_description,
1546 'player_url': player_url,
1547 'subtitles': video_subtitles
1548 })
1549 except UnavailableVideoError, err:
1550 self._downloader.trouble(u'\nERROR: unable to download video')
1551
1552
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for videos that metacafe merely embeds from YouTube
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives the YoutubeIE to delegate yt-* ids to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and disable the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract and download the video behind a metacafe.com watch URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate the whole extraction to the YouTube IE
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Older page layout: media URL (plus optional gdaKey) in the page query string
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer layout: media URL and key live inside the flashvars blob
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The JSON-ish blob escapes slashes; undo that
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1693
1694
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract and download the video behind a Dailymotion video URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disabling the family filter exposes age-restricted videos
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		# The "sequence" flashvar contains the SD stream URL
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1781
1782
1783 class GoogleIE(InfoExtractor):
1784 """Information extractor for video.google.com."""
1785
1786 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1787 IE_NAME = u'video.google'
1788
1789 def __init__(self, downloader=None):
1790 InfoExtractor.__init__(self, downloader)
1791
1792 def report_download_webpage(self, video_id):
1793 """Report webpage download."""
1794 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1795
1796 def report_extraction(self, video_id):
1797 """Report information extraction."""
1798 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1799
1800 def _real_extract(self, url):
1801 # Extract id from URL
1802 mobj = re.match(self._VALID_URL, url)
1803 if mobj is None:
1804 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1805 return
1806
1807 # At this point we have a new video
1808 self._downloader.increment_downloads()
1809 video_id = mobj.group(1)
1810
1811 video_extension = 'mp4'
1812
1813 # Retrieve video webpage to extract further information
1814 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1815 try:
1816 self.report_download_webpage(video_id)
1817 webpage = urllib2.urlopen(request).read()
1818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1819 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1820 return
1821
1822 # Extract URL, uploader, and title from webpage
1823 self.report_extraction(video_id)
1824 mobj = re.search(r"download_url:'([^']+)'", webpage)
1825 if mobj is None:
1826 video_extension = 'flv'
1827 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1828 if mobj is None:
1829 self._downloader.trouble(u'ERROR: unable to extract media URL')
1830 return
1831 mediaURL = urllib.unquote(mobj.group(1))
1832 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1833 mediaURL = mediaURL.replace('\\x26', '\x26')
1834
1835 video_url = mediaURL
1836
1837 mobj = re.search(r'<title>(.*)</title>', webpage)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: unable to extract title')
1840 return
1841 video_title = mobj.group(1).decode('utf-8')
1842 video_title = sanitize_title(video_title)
1843 simple_title = _simplify_title(video_title)
1844
1845 # Extract video description
1846 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1847 if mobj is None:
1848 self._downloader.trouble(u'ERROR: unable to extract video description')
1849 return
1850 video_description = mobj.group(1).decode('utf-8')
1851 if not video_description:
1852 video_description = 'No description available.'
1853
1854 # Extract video thumbnail
1855 if self._downloader.params.get('forcethumbnail', False):
1856 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1857 try:
1858 webpage = urllib2.urlopen(request).read()
1859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1860 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1861 return
1862 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1863 if mobj is None:
1864 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1865 return
1866 video_thumbnail = mobj.group(1)
1867 else: # we need something to pass to process_info
1868 video_thumbnail = ''
1869
1870 try:
1871 # Process video information
1872 self._downloader.process_info({
1873 'id': video_id.decode('utf-8'),
1874 'url': video_url.decode('utf-8'),
1875 'uploader': u'NA',
1876 'upload_date': u'NA',
1877 'title': video_title,
1878 'stitle': simple_title,
1879 'ext': video_extension.decode('utf-8'),
1880 'format': u'NA',
1881 'player_url': None,
1882 })
1883 except UnavailableVideoError:
1884 self._downloader.trouble(u'\nERROR: unable to download video')
1885
1886
1887 class PhotobucketIE(InfoExtractor):
1888 """Information extractor for photobucket.com."""
1889
1890 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1891 IE_NAME = u'photobucket'
1892
1893 def __init__(self, downloader=None):
1894 InfoExtractor.__init__(self, downloader)
1895
1896 def report_download_webpage(self, video_id):
1897 """Report webpage download."""
1898 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1899
1900 def report_extraction(self, video_id):
1901 """Report information extraction."""
1902 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1903
1904 def _real_extract(self, url):
1905 # Extract id from URL
1906 mobj = re.match(self._VALID_URL, url)
1907 if mobj is None:
1908 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1909 return
1910
1911 # At this point we have a new video
1912 self._downloader.increment_downloads()
1913 video_id = mobj.group(1)
1914
1915 video_extension = 'flv'
1916
1917 # Retrieve video webpage to extract further information
1918 request = urllib2.Request(url)
1919 try:
1920 self.report_download_webpage(video_id)
1921 webpage = urllib2.urlopen(request).read()
1922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1923 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1924 return
1925
1926 # Extract URL, uploader, and title from webpage
1927 self.report_extraction(video_id)
1928 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1929 if mobj is None:
1930 self._downloader.trouble(u'ERROR: unable to extract media URL')
1931 return
1932 mediaURL = urllib.unquote(mobj.group(1))
1933
1934 video_url = mediaURL
1935
1936 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1937 if mobj is None:
1938 self._downloader.trouble(u'ERROR: unable to extract title')
1939 return
1940 video_title = mobj.group(1).decode('utf-8')
1941 video_title = sanitize_title(video_title)
1942 simple_title = _simplify_title(vide_title)
1943
1944 video_uploader = mobj.group(2).decode('utf-8')
1945
1946 try:
1947 # Process video information
1948 self._downloader.process_info({
1949 'id': video_id.decode('utf-8'),
1950 'url': video_url.decode('utf-8'),
1951 'uploader': video_uploader,
1952 'upload_date': u'NA',
1953 'title': video_title,
1954 'stitle': simple_title,
1955 'ext': video_extension.decode('utf-8'),
1956 'format': u'NA',
1957 'player_url': None,
1958 })
1959 except UnavailableVideoError:
1960 self._downloader.trouble(u'\nERROR: unable to download video')
1961
1962
1963 class YahooIE(InfoExtractor):
1964 """Information extractor for video.yahoo.com."""
1965
1966 # _VALID_URL matches all Yahoo! Video URLs
1967 # _VPAGE_URL matches only the extractable '/watch/' URLs
1968 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1969 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1970 IE_NAME = u'video.yahoo'
1971
1972 def __init__(self, downloader=None):
1973 InfoExtractor.__init__(self, downloader)
1974
1975 def report_download_webpage(self, video_id):
1976 """Report webpage download."""
1977 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1978
1979 def report_extraction(self, video_id):
1980 """Report information extraction."""
1981 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1982
1983 def _real_extract(self, url, new_video=True):
1984 # Extract ID from URL
1985 mobj = re.match(self._VALID_URL, url)
1986 if mobj is None:
1987 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1988 return
1989
1990 # At this point we have a new video
1991 self._downloader.increment_downloads()
1992 video_id = mobj.group(2)
1993 video_extension = 'flv'
1994
1995 # Rewrite valid but non-extractable URLs as
1996 # extractable English language /watch/ URLs
1997 if re.match(self._VPAGE_URL, url) is None:
1998 request = urllib2.Request(url)
1999 try:
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2003 return
2004
2005 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2006 if mobj is None:
2007 self._downloader.trouble(u'ERROR: Unable to extract id field')
2008 return
2009 yahoo_id = mobj.group(1)
2010
2011 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2012 if mobj is None:
2013 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2014 return
2015 yahoo_vid = mobj.group(1)
2016
2017 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2018 return self._real_extract(url, new_video=False)
2019
2020 # Retrieve video webpage to extract further information
2021 request = urllib2.Request(url)
2022 try:
2023 self.report_download_webpage(video_id)
2024 webpage = urllib2.urlopen(request).read()
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2027 return
2028
2029 # Extract uploader and title from webpage
2030 self.report_extraction(video_id)
2031 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2032 if mobj is None:
2033 self._downloader.trouble(u'ERROR: unable to extract video title')
2034 return
2035 video_title = mobj.group(1).decode('utf-8')
2036 simple_title = _simplify_title(video_title)
2037
2038 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2039 if mobj is None:
2040 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2041 return
2042 video_uploader = mobj.group(1).decode('utf-8')
2043
2044 # Extract video thumbnail
2045 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2046 if mobj is None:
2047 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2048 return
2049 video_thumbnail = mobj.group(1).decode('utf-8')
2050
2051 # Extract video description
2052 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2053 if mobj is None:
2054 self._downloader.trouble(u'ERROR: unable to extract video description')
2055 return
2056 video_description = mobj.group(1).decode('utf-8')
2057 if not video_description:
2058 video_description = 'No description available.'
2059
2060 # Extract video height and width
2061 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2062 if mobj is None:
2063 self._downloader.trouble(u'ERROR: unable to extract video height')
2064 return
2065 yv_video_height = mobj.group(1)
2066
2067 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2068 if mobj is None:
2069 self._downloader.trouble(u'ERROR: unable to extract video width')
2070 return
2071 yv_video_width = mobj.group(1)
2072
2073 # Retrieve video playlist to extract media URL
2074 # I'm not completely sure what all these options are, but we
2075 # seem to need most of them, otherwise the server sends a 401.
2076 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2077 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2078 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2079 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2080 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2081 try:
2082 self.report_download_webpage(video_id)
2083 webpage = urllib2.urlopen(request).read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2086 return
2087
2088 # Extract media URL from playlist XML
2089 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2090 if mobj is None:
2091 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2092 return
2093 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2094 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2095
2096 try:
2097 # Process video information
2098 self._downloader.process_info({
2099 'id': video_id.decode('utf-8'),
2100 'url': video_url,
2101 'uploader': video_uploader,
2102 'upload_date': u'NA',
2103 'title': video_title,
2104 'stitle': simple_title,
2105 'ext': video_extension.decode('utf-8'),
2106 'thumbnail': video_thumbnail.decode('utf-8'),
2107 'description': video_description,
2108 'thumbnail': video_thumbnail,
2109 'player_url': None,
2110 })
2111 except UnavailableVideoError:
2112 self._downloader.trouble(u'\nERROR: unable to download video')
2113
2114
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
	IE_NAME = u'vimeo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

	def _real_extract(self, url, new_video=True):
		"""Extract metadata from the page's embedded config JSON and build a
		play_redirect URL for the best known codec.

		NOTE(review): the new_video parameter is accepted but never read in
		this body -- presumably kept for signature parity with YahooIE.
		"""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url, None, std_headers)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Now we begin extracting as much information as we can from what we
		# retrieved. First we extract the information common to all extractors,
		# and latter we extract those that are Vimeo specific.
		self.report_extraction(video_id)

		# Extract the config JSON by slicing between the literal page markers
		# ' = {config:' and ',assets:' -- brittle, but avoids a full JS parse.
		config = webpage.split(' = {config:')[1].split(',assets:')[0]
		try:
			config = json.loads(config)
		# NOTE(review): bare except -- swallows any parse failure (and more);
		# narrowing to ValueError would be safer, left as-is here.
		except:
			self._downloader.trouble(u'ERROR: unable to extract info section')
			return

		# Extract title
		video_title = config["video"]["title"]
		simple_title = _simplify_title(video_title)

		# Extract uploader
		video_uploader = config["video"]["owner"]["name"]

		# Extract video thumbnail
		video_thumbnail = config["video"]["thumbnail"]

		# Extract video description. lxml is an optional dependency (its
		# import at the top of the file is wrapped in try/except), so probe
		# for the name and fall back to a regex when it is absent.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
			if mobj is not None:
				video_description = mobj.group(1)
		else:
			html_parser = lxml.etree.HTMLParser()
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
			# TODO use another parser

		# Extract upload date (optional; stays u'NA' when the span is absent)
		video_upload_date = u'NA'
		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
		if mobj is not None:
			video_upload_date = mobj.group(1)

		# Vimeo specific: extract request signature and timestamp
		sig = config['request']['signature']
		timestamp = config['request']['timestamp']

		# Vimeo specific: extract video codec and quality information.
		# Codecs are tried in preference order; the for/else reports failure
		# only when none of them appears in the config.
		# TODO bind to format param
		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
		for codec in codecs:
			if codec[0] in config["video"]["files"]:
				video_codec = codec[0]
				video_extension = codec[1]
				# 'hd' wins whenever the chosen codec offers it.
				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
				else: quality = 'sd'
				break
		else:
			self._downloader.trouble(u'ERROR: no known codec found')
			return

		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
			%(video_id, sig, timestamp, quality, video_codec.upper())

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id,
				'url':		video_url,
				'uploader':	video_uploader,
				'upload_date':	video_upload_date,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension,
				'thumbnail':	video_thumbnail,
				'description':	video_description,
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
2233
2234
2235 class GenericIE(InfoExtractor):
2236 """Generic last-resort information extractor."""
2237
2238 _VALID_URL = r'.*'
2239 IE_NAME = u'generic'
2240
2241 def __init__(self, downloader=None):
2242 InfoExtractor.__init__(self, downloader)
2243
2244 def report_download_webpage(self, video_id):
2245 """Report webpage download."""
2246 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2247 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2248
2249 def report_extraction(self, video_id):
2250 """Report information extraction."""
2251 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2252
2253 def _real_extract(self, url):
2254 # At this point we have a new video
2255 self._downloader.increment_downloads()
2256
2257 video_id = url.split('/')[-1]
2258 request = urllib2.Request(url)
2259 try:
2260 self.report_download_webpage(video_id)
2261 webpage = urllib2.urlopen(request).read()
2262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2264 return
2265 except ValueError, err:
2266 # since this is the last-resort InfoExtractor, if
2267 # this error is thrown, it'll be thrown here
2268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2269 return
2270
2271 self.report_extraction(video_id)
2272 # Start with something easy: JW Player in SWFObject
2273 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2274 if mobj is None:
2275 # Broaden the search a little bit
2276 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2277 if mobj is None:
2278 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2279 return
2280
2281 # It's possible that one of the regexes
2282 # matched, but returned an empty group:
2283 if mobj.group(1) is None:
2284 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2285 return
2286
2287 video_url = urllib.unquote(mobj.group(1))
2288 video_id = os.path.basename(video_url)
2289
2290 # here's a fun little line of code for you:
2291 video_extension = os.path.splitext(video_id)[1][1:]
2292 video_id = os.path.splitext(video_id)[0]
2293
2294 # it's tempting to parse this further, but you would
2295 # have to take into account all the variations like
2296 # Video Title - Site Name
2297 # Site Name | Video Title
2298 # Video Title - Tagline | Site Name
2299 # and so on and so forth; it's just not practical
2300 mobj = re.search(r'<title>(.*)</title>', webpage)
2301 if mobj is None:
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2303 return
2304 video_title = mobj.group(1).decode('utf-8')
2305 video_title = sanitize_title(video_title)
2306 simple_title = _simplify_title(video_title)
2307
2308 # video uploader is domain name
2309 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2310 if mobj is None:
2311 self._downloader.trouble(u'ERROR: unable to extract title')
2312 return
2313 video_uploader = mobj.group(1).decode('utf-8')
2314
2315 try:
2316 # Process video information
2317 self._downloader.process_info({
2318 'id': video_id.decode('utf-8'),
2319 'url': video_url.decode('utf-8'),
2320 'uploader': video_uploader,
2321 'upload_date': u'NA',
2322 'title': video_title,
2323 'stitle': simple_title,
2324 'ext': video_extension.decode('utf-8'),
2325 'format': u'NA',
2326 'player_url': None,
2327 })
2328 except UnavailableVideoError, err:
2329 self._downloader.trouble(u'\nERROR: unable to download video')
2330
2331
2332 class YoutubeSearchIE(InfoExtractor):
2333 """Information Extractor for YouTube search queries."""
2334 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2335 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2336 _youtube_ie = None
2337 _max_youtube_results = 1000
2338 IE_NAME = u'youtube:search'
2339
2340 def __init__(self, youtube_ie, downloader=None):
2341 InfoExtractor.__init__(self, downloader)
2342 self._youtube_ie = youtube_ie
2343
2344 def report_download_page(self, query, pagenum):
2345 """Report attempt to download playlist page with given number."""
2346 query = query.decode(preferredencoding())
2347 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2348
2349 def _real_initialize(self):
2350 self._youtube_ie.initialize()
2351
2352 def _real_extract(self, query):
2353 mobj = re.match(self._VALID_URL, query)
2354 if mobj is None:
2355 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2356 return
2357
2358 prefix, query = query.split(':')
2359 prefix = prefix[8:]
2360 query = query.encode('utf-8')
2361 if prefix == '':
2362 self._download_n_results(query, 1)
2363 return
2364 elif prefix == 'all':
2365 self._download_n_results(query, self._max_youtube_results)
2366 return
2367 else:
2368 try:
2369 n = long(prefix)
2370 if n <= 0:
2371 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2372 return
2373 elif n > self._max_youtube_results:
2374 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2375 n = self._max_youtube_results
2376 self._download_n_results(query, n)
2377 return
2378 except ValueError: # parsing prefix as integer fails
2379 self._download_n_results(query, 1)
2380 return
2381
2382 def _download_n_results(self, query, n):
2383 """Downloads a specified number of results for a query"""
2384
2385 video_ids = []
2386 pagenum = 0
2387 limit = n
2388
2389 while (50 * pagenum) < limit:
2390 self.report_download_page(query, pagenum+1)
2391 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2392 request = urllib2.Request(result_url)
2393 try:
2394 data = urllib2.urlopen(request).read()
2395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2397 return
2398 api_response = json.loads(data)['data']
2399
2400 new_ids = list(video['id'] for video in api_response['items'])
2401 video_ids += new_ids
2402
2403 limit = min(n, api_response['totalItems'])
2404 pagenum += 1
2405
2406 if len(video_ids) > n:
2407 video_ids = video_ids[:n]
2408 for id in video_ids:
2409 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2410 return
2411
2412
2413 class GoogleSearchIE(InfoExtractor):
2414 """Information Extractor for Google Video search queries."""
2415 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2416 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2417 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2418 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2419 _google_ie = None
2420 _max_google_results = 1000
2421 IE_NAME = u'video.google:search'
2422
2423 def __init__(self, google_ie, downloader=None):
2424 InfoExtractor.__init__(self, downloader)
2425 self._google_ie = google_ie
2426
2427 def report_download_page(self, query, pagenum):
2428 """Report attempt to download playlist page with given number."""
2429 query = query.decode(preferredencoding())
2430 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2431
2432 def _real_initialize(self):
2433 self._google_ie.initialize()
2434
2435 def _real_extract(self, query):
2436 mobj = re.match(self._VALID_URL, query)
2437 if mobj is None:
2438 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2439 return
2440
2441 prefix, query = query.split(':')
2442 prefix = prefix[8:]
2443 query = query.encode('utf-8')
2444 if prefix == '':
2445 self._download_n_results(query, 1)
2446 return
2447 elif prefix == 'all':
2448 self._download_n_results(query, self._max_google_results)
2449 return
2450 else:
2451 try:
2452 n = long(prefix)
2453 if n <= 0:
2454 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2455 return
2456 elif n > self._max_google_results:
2457 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2458 n = self._max_google_results
2459 self._download_n_results(query, n)
2460 return
2461 except ValueError: # parsing prefix as integer fails
2462 self._download_n_results(query, 1)
2463 return
2464
2465 def _download_n_results(self, query, n):
2466 """Downloads a specified number of results for a query"""
2467
2468 video_ids = []
2469 pagenum = 0
2470
2471 while True:
2472 self.report_download_page(query, pagenum)
2473 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2474 request = urllib2.Request(result_url)
2475 try:
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2479 return
2480
2481 # Extract video identifiers
2482 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2483 video_id = mobj.group(1)
2484 if video_id not in video_ids:
2485 video_ids.append(video_id)
2486 if len(video_ids) == n:
2487 # Specified n videos reached
2488 for id in video_ids:
2489 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2490 return
2491
2492 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2493 for id in video_ids:
2494 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2495 return
2496
2497 pagenum = pagenum + 1
2498
2499
2500 class YahooSearchIE(InfoExtractor):
2501 """Information Extractor for Yahoo! Video search queries."""
2502 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2503 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2504 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2505 _MORE_PAGES_INDICATOR = r'\s*Next'
2506 _yahoo_ie = None
2507 _max_yahoo_results = 1000
2508 IE_NAME = u'video.yahoo:search'
2509
2510 def __init__(self, yahoo_ie, downloader=None):
2511 InfoExtractor.__init__(self, downloader)
2512 self._yahoo_ie = yahoo_ie
2513
2514 def report_download_page(self, query, pagenum):
2515 """Report attempt to download playlist page with given number."""
2516 query = query.decode(preferredencoding())
2517 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2518
2519 def _real_initialize(self):
2520 self._yahoo_ie.initialize()
2521
2522 def _real_extract(self, query):
2523 mobj = re.match(self._VALID_URL, query)
2524 if mobj is None:
2525 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2526 return
2527
2528 prefix, query = query.split(':')
2529 prefix = prefix[8:]
2530 query = query.encode('utf-8')
2531 if prefix == '':
2532 self._download_n_results(query, 1)
2533 return
2534 elif prefix == 'all':
2535 self._download_n_results(query, self._max_yahoo_results)
2536 return
2537 else:
2538 try:
2539 n = long(prefix)
2540 if n <= 0:
2541 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2542 return
2543 elif n > self._max_yahoo_results:
2544 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2545 n = self._max_yahoo_results
2546 self._download_n_results(query, n)
2547 return
2548 except ValueError: # parsing prefix as integer fails
2549 self._download_n_results(query, 1)
2550 return
2551
2552 def _download_n_results(self, query, n):
2553 """Downloads a specified number of results for a query"""
2554
2555 video_ids = []
2556 already_seen = set()
2557 pagenum = 1
2558
2559 while True:
2560 self.report_download_page(query, pagenum)
2561 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2562 request = urllib2.Request(result_url)
2563 try:
2564 page = urllib2.urlopen(request).read()
2565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2566 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2567 return
2568
2569 # Extract video identifiers
2570 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2571 video_id = mobj.group(1)
2572 if video_id not in already_seen:
2573 video_ids.append(video_id)
2574 already_seen.add(video_id)
2575 if len(video_ids) == n:
2576 # Specified n videos reached
2577 for id in video_ids:
2578 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2579 return
2580
2581 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2582 for id in video_ids:
2583 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2584 return
2585
2586 pagenum = pagenum + 1
2587
2588
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Matches playlist, artist, course, user and '#p/c/' style URLs; group 1
	# is the list-type query key (p|a|list), group 2 the playlist id, group 3
	# an optional single-video id.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled as (access_page, query_key, playlist_id, page_number)
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		"""Collect all video ids of a playlist (following pagination) and
		delegate each one to the YouTube extractor."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduped within a page; cross-page
			# duplicates are kept, matching the playlist's own listing)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop when the page has no 'Next' link
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Apply --playlist-start/--playlist-end. NOTE(review): playlistend is
		# used as an absolute index here, not relative to playliststart --
		# looks intentional for this version; confirm against option docs.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
		return
2665
2666
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # maximum results per GData query
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    _youtube_ie = None  # delegate extractor that downloads each found video
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        # Delegate initialization (e.g. login) to the YouTube extractor.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Enumerate a user's uploads via the GData feed, then delegate each video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData's start-index parameter is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated within the page).
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, inclusive).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        # Delegate each collected video to the regular YouTube extractor.
        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2753
2754
2755 class DepositFilesIE(InfoExtractor):
2756 """Information extractor for depositfiles.com"""
2757
2758 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2759 IE_NAME = u'DepositFiles'
2760
2761 def __init__(self, downloader=None):
2762 InfoExtractor.__init__(self, downloader)
2763
2764 def report_download_webpage(self, file_id):
2765 """Report webpage download."""
2766 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2767
2768 def report_extraction(self, file_id):
2769 """Report information extraction."""
2770 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2771
2772 def _real_extract(self, url):
2773 # At this point we have a new file
2774 self._downloader.increment_downloads()
2775
2776 file_id = url.split('/')[-1]
2777 # Rebuild url in english locale
2778 url = 'http://depositfiles.com/en/files/' + file_id
2779
2780 # Retrieve file webpage with 'Free download' button pressed
2781 free_download_indication = { 'gateway_result' : '1' }
2782 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2783 try:
2784 self.report_download_webpage(file_id)
2785 webpage = urllib2.urlopen(request).read()
2786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2787 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2788 return
2789
2790 # Search for the real file URL
2791 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2792 if (mobj is None) or (mobj.group(1) is None):
2793 # Try to figure out reason of the error.
2794 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2795 if (mobj is not None) and (mobj.group(1) is not None):
2796 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2797 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2798 else:
2799 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2800 return
2801
2802 file_url = mobj.group(1)
2803 file_extension = os.path.splitext(file_url)[1][1:]
2804
2805 # Search for file title
2806 mobj = re.search(r'<b title="(.*?)">', webpage)
2807 if mobj is None:
2808 self._downloader.trouble(u'ERROR: unable to extract title')
2809 return
2810 file_title = mobj.group(1).decode('utf-8')
2811
2812 try:
2813 # Process file information
2814 self._downloader.process_info({
2815 'id': file_id.decode('utf-8'),
2816 'url': file_url.decode('utf-8'),
2817 'uploader': u'NA',
2818 'upload_date': u'NA',
2819 'title': file_title,
2820 'stitle': file_title,
2821 'ext': file_extension.decode('utf-8'),
2822 'format': u'NA',
2823 'player_url': None,
2824 })
2825 except UnavailableVideoError, err:
2826 self._downloader.trouble(u'ERROR: unable to download file')
2827
2828
2829 class FacebookIE(InfoExtractor):
2830 """Information Extractor for Facebook"""
2831
2832 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2833 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2834 _NETRC_MACHINE = 'facebook'
2835 _available_formats = ['video', 'highqual', 'lowqual']
2836 _video_extensions = {
2837 'video': 'mp4',
2838 'highqual': 'mp4',
2839 'lowqual': 'mp4',
2840 }
2841 IE_NAME = u'facebook'
2842
2843 def __init__(self, downloader=None):
2844 InfoExtractor.__init__(self, downloader)
2845
2846 def _reporter(self, message):
2847 """Add header and report message."""
2848 self._downloader.to_screen(u'[facebook] %s' % message)
2849
2850 def report_login(self):
2851 """Report attempt to log in."""
2852 self._reporter(u'Logging in')
2853
2854 def report_video_webpage_download(self, video_id):
2855 """Report attempt to download video webpage."""
2856 self._reporter(u'%s: Downloading video webpage' % video_id)
2857
2858 def report_information_extraction(self, video_id):
2859 """Report attempt to extract video information."""
2860 self._reporter(u'%s: Extracting video information' % video_id)
2861
2862 def _parse_page(self, video_webpage):
2863 """Extract video information from page"""
2864 # General data
2865 data = {'title': r'\("video_title", "(.*?)"\)',
2866 'description': r'<div class="datawrap">(.*?)</div>',
2867 'owner': r'\("video_owner_name", "(.*?)"\)',
2868 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2869 }
2870 video_info = {}
2871 for piece in data.keys():
2872 mobj = re.search(data[piece], video_webpage)
2873 if mobj is not None:
2874 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875
2876 # Video urls
2877 video_urls = {}
2878 for fmt in self._available_formats:
2879 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2880 if mobj is not None:
2881 # URL is in a Javascript segment inside an escaped Unicode format within
2882 # the generally utf-8 page
2883 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2884 video_info['video_urls'] = video_urls
2885
2886 return video_info
2887
2888 def _real_initialize(self):
2889 if self._downloader is None:
2890 return
2891
2892 useremail = None
2893 password = None
2894 downloader_params = self._downloader.params
2895
2896 # Attempt to use provided username and password or .netrc data
2897 if downloader_params.get('username', None) is not None:
2898 useremail = downloader_params['username']
2899 password = downloader_params['password']
2900 elif downloader_params.get('usenetrc', False):
2901 try:
2902 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2903 if info is not None:
2904 useremail = info[0]
2905 password = info[2]
2906 else:
2907 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2908 except (IOError, netrc.NetrcParseError), err:
2909 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2910 return
2911
2912 if useremail is None:
2913 return
2914
2915 # Log in
2916 login_form = {
2917 'email': useremail,
2918 'pass': password,
2919 'login': 'Log+In'
2920 }
2921 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2922 try:
2923 self.report_login()
2924 login_results = urllib2.urlopen(request).read()
2925 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2926 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2927 return
2928 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2929 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2930 return
2931
2932 def _real_extract(self, url):
2933 mobj = re.match(self._VALID_URL, url)
2934 if mobj is None:
2935 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2936 return
2937 video_id = mobj.group('ID')
2938
2939 # Get video webpage
2940 self.report_video_webpage_download(video_id)
2941 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2942 try:
2943 page = urllib2.urlopen(request)
2944 video_webpage = page.read()
2945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2946 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2947 return
2948
2949 # Start extracting information
2950 self.report_information_extraction(video_id)
2951
2952 # Extract information
2953 video_info = self._parse_page(video_webpage)
2954
2955 # uploader
2956 if 'owner' not in video_info:
2957 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2958 return
2959 video_uploader = video_info['owner']
2960
2961 # title
2962 if 'title' not in video_info:
2963 self._downloader.trouble(u'ERROR: unable to extract video title')
2964 return
2965 video_title = video_info['title']
2966 video_title = video_title.decode('utf-8')
2967 video_title = sanitize_title(video_title)
2968
2969 simple_title = _simplify_title(video_title)
2970
2971 # thumbnail image
2972 if 'thumbnail' not in video_info:
2973 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2974 video_thumbnail = ''
2975 else:
2976 video_thumbnail = video_info['thumbnail']
2977
2978 # upload date
2979 upload_date = u'NA'
2980 if 'upload_date' in video_info:
2981 upload_time = video_info['upload_date']
2982 timetuple = email.utils.parsedate_tz(upload_time)
2983 if timetuple is not None:
2984 try:
2985 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2986 except:
2987 pass
2988
2989 # description
2990 video_description = video_info.get('description', 'No description available.')
2991
2992 url_map = video_info['video_urls']
2993 if len(url_map.keys()) > 0:
2994 # Decide which formats to download
2995 req_format = self._downloader.params.get('format', None)
2996 format_limit = self._downloader.params.get('format_limit', None)
2997
2998 if format_limit is not None and format_limit in self._available_formats:
2999 format_list = self._available_formats[self._available_formats.index(format_limit):]
3000 else:
3001 format_list = self._available_formats
3002 existing_formats = [x for x in format_list if x in url_map]
3003 if len(existing_formats) == 0:
3004 self._downloader.trouble(u'ERROR: no known formats available for video')
3005 return
3006 if req_format is None:
3007 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3008 elif req_format == 'worst':
3009 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3010 elif req_format == '-1':
3011 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3012 else:
3013 # Specific format
3014 if req_format not in url_map:
3015 self._downloader.trouble(u'ERROR: requested format not available')
3016 return
3017 video_url_list = [(req_format, url_map[req_format])] # Specific format
3018
3019 for format_param, video_real_url in video_url_list:
3020
3021 # At this point we have a new video
3022 self._downloader.increment_downloads()
3023
3024 # Extension
3025 video_extension = self._video_extensions.get(format_param, 'mp4')
3026
3027 try:
3028 # Process video information
3029 self._downloader.process_info({
3030 'id': video_id.decode('utf-8'),
3031 'url': video_real_url.decode('utf-8'),
3032 'uploader': video_uploader.decode('utf-8'),
3033 'upload_date': upload_date,
3034 'title': video_title,
3035 'stitle': simple_title,
3036 'ext': video_extension.decode('utf-8'),
3037 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3038 'thumbnail': video_thumbnail.decode('utf-8'),
3039 'description': video_description.decode('utf-8'),
3040 'player_url': None,
3041 })
3042 except UnavailableVideoError, err:
3043 self._downloader.trouble(u'\nERROR: unable to download video')
3044
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # filename extension at the end of a media URL
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv JSON metadata for the URL (or detect a direct media link)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for JSON metadata by appending skin=json to the URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            # If the server answers with a video/* Content-Type, the URL is
            # the media file itself and no JSON step is needed.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh  # reuse the already-open handle for download
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' uses the '%m-%d-%y %H:%M%p' format; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the filename extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3137
3138
3139 class MyVideoIE(InfoExtractor):
3140 """Information Extractor for myvideo.de."""
3141
3142 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3143 IE_NAME = u'myvideo'
3144
3145 def __init__(self, downloader=None):
3146 InfoExtractor.__init__(self, downloader)
3147
3148 def report_download_webpage(self, video_id):
3149 """Report webpage download."""
3150 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3151
3152 def report_extraction(self, video_id):
3153 """Report information extraction."""
3154 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3155
3156 def _real_extract(self,url):
3157 mobj = re.match(self._VALID_URL, url)
3158 if mobj is None:
3159 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3160 return
3161
3162 video_id = mobj.group(1)
3163
3164 # Get video webpage
3165 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3166 try:
3167 self.report_download_webpage(video_id)
3168 webpage = urllib2.urlopen(request).read()
3169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3170 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3171 return
3172
3173 self.report_extraction(video_id)
3174 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3175 webpage)
3176 if mobj is None:
3177 self._downloader.trouble(u'ERROR: unable to extract media URL')
3178 return
3179 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3180
3181 mobj = re.search('<title>([^<]+)</title>', webpage)
3182 if mobj is None:
3183 self._downloader.trouble(u'ERROR: unable to extract title')
3184 return
3185
3186 video_title = mobj.group(1)
3187 video_title = sanitize_title(video_title)
3188
3189 simple_title = _simplify_title(video_title)
3190
3191 try:
3192 self._downloader.process_info({
3193 'id': video_id,
3194 'url': video_url,
3195 'uploader': u'NA',
3196 'upload_date': u'NA',
3197 'title': video_title,
3198 'stitle': simple_title,
3199 'ext': u'flv',
3200 'format': u'NA',
3201 'player_url': None,
3202 })
3203 except UnavailableVideoError:
3204 self._downloader.trouble(u'\nERROR: Unable to download video')
3205
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ':tds'/':colbert' style shortcuts as well as full episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media-segment configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve the episode page, fetch its MRSS index, and download each segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map ':tds' style shortcuts to the show's full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode group means "download the newest episode".
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # The site redirects the full-episodes page to the newest episode;
            # re-parse the final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the embedded Flash player URL(s): (full player URL, media uri).
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # Each <item> in the MRSS index is one media segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs for every available rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3340
3341
3342 class EscapistIE(InfoExtractor):
3343 """Information extractor for The Escapist """
3344
3345 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3346 IE_NAME = u'escapist'
3347
3348 def report_extraction(self, showName):
3349 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3350
3351 def report_config_download(self, showName):
3352 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3353
3354 def _real_extract(self, url):
3355 htmlParser = HTMLParser.HTMLParser()
3356
3357 mobj = re.match(self._VALID_URL, url)
3358 if mobj is None:
3359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3360 return
3361 showName = mobj.group('showname')
3362 videoId = mobj.group('episode')
3363
3364 self.report_extraction(showName)
3365 try:
3366 webPage = urllib2.urlopen(url).read()
3367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3368 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3369 return
3370
3371 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3372 description = htmlParser.unescape(descMatch.group(1))
3373 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3374 imgUrl = htmlParser.unescape(imgMatch.group(1))
3375 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3376 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3377 configUrlMatch = re.search('config=(.*)$', playerUrl)
3378 configUrl = urllib2.unquote(configUrlMatch.group(1))
3379
3380 self.report_config_download(showName)
3381 try:
3382 configJSON = urllib2.urlopen(configUrl).read()
3383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3384 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3385 return
3386
3387 # Technically, it's JavaScript, not JSON
3388 configJSON = configJSON.replace("'", '"')
3389
3390 try:
3391 config = json.loads(configJSON)
3392 except (ValueError,), err:
3393 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3394 return
3395
3396 playlist = config['playlist']
3397 videoUrl = playlist[1]['url']
3398
3399 self._downloader.increment_downloads()
3400 info = {
3401 'id': videoId,
3402 'url': videoUrl,
3403 'uploader': showName,
3404 'upload_date': None,
3405 'title': showName,
3406 'stitle': _simplify_title(showName),
3407 'ext': 'flv',
3408 'format': 'flv',
3409 'thumbnail': imgUrl,
3410 'description': description,
3411 'player_url': playerUrl,
3412 }
3413
3414 try:
3415 self._downloader.process_info(info)
3416 except UnavailableVideoError, err:
3417 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3418
3419
3420 class CollegeHumorIE(InfoExtractor):
3421 """Information extractor for collegehumor.com"""
3422
3423 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3424 IE_NAME = u'collegehumor'
3425
3426 def report_webpage(self, video_id):
3427 """Report information extraction."""
3428 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3429
3430 def report_extraction(self, video_id):
3431 """Report information extraction."""
3432 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3433
3434 def _real_extract(self, url):
3435 htmlParser = HTMLParser.HTMLParser()
3436
3437 mobj = re.match(self._VALID_URL, url)
3438 if mobj is None:
3439 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3440 return
3441 video_id = mobj.group('videoid')
3442
3443 self.report_webpage(video_id)
3444 request = urllib2.Request(url)
3445 try:
3446 webpage = urllib2.urlopen(request).read()
3447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3448 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3449 return
3450
3451 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3452 if m is None:
3453 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3454 return
3455 internal_video_id = m.group('internalvideoid')
3456
3457 info = {
3458 'id': video_id,
3459 'internal_id': internal_video_id,
3460 }
3461
3462 self.report_extraction(video_id)
3463 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3464 try:
3465 metaXml = urllib2.urlopen(xmlUrl).read()
3466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3467 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3468 return
3469
3470 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3471 try:
3472 videoNode = mdoc.findall('./video')[0]
3473 info['description'] = videoNode.findall('./description')[0].text
3474 info['title'] = videoNode.findall('./caption')[0].text
3475 info['stitle'] = _simplify_title(info['title'])
3476 info['url'] = videoNode.findall('./file')[0].text
3477 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3478 info['ext'] = info['url'].rpartition('.')[2]
3479 info['format'] = info['ext']
3480 except IndexError:
3481 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3482 return
3483
3484 self._downloader.increment_downloads()
3485
3486 try:
3487 self._downloader.process_info(info)
3488 except UnavailableVideoError, err:
3489 self._downloader.trouble(u'\nERROR: unable to download video')
3490
3491
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title, and thumbnail from an xvideos page."""
        # NOTE(review): htmlParser is created but never used in this method.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL
        # The media URL is percent-encoded in a 'flv_url' query parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title (the part of <title> before the site suffix)
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'ext': 'flv',
            'format': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
            'player_url': None,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3571
3572
3573 class SoundcloudIE(InfoExtractor):
3574 """Information extractor for soundcloud.com
3575 To access the media, the uid of the song and a stream token
3576 must be extracted from the page source and the script must make
3577 a request to media.soundcloud.com/crossdomain.xml. Then
3578 the media can be grabbed by requesting from an url composed
3579 of the stream token and uid
3580 """
3581
3582 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3583 IE_NAME = u'soundcloud'
3584
3585 def __init__(self, downloader=None):
3586 InfoExtractor.__init__(self, downloader)
3587
3588 def report_webpage(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3591
3592 def report_extraction(self, video_id):
3593 """Report information extraction."""
3594 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3595
3596 def _real_extract(self, url):
3597 htmlParser = HTMLParser.HTMLParser()
3598
3599 mobj = re.match(self._VALID_URL, url)
3600 if mobj is None:
3601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3602 return
3603
3604 # extract uploader (which is in the url)
3605 uploader = mobj.group(1).decode('utf-8')
3606 # extract simple title (uploader + slug of song title)
3607 slug_title = mobj.group(2).decode('utf-8')
3608 simple_title = uploader + '-' + slug_title
3609
3610 self.report_webpage('%s/%s' % (uploader, slug_title))
3611
3612 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3613 try:
3614 webpage = urllib2.urlopen(request).read()
3615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3616 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3617 return
3618
3619 self.report_extraction('%s/%s' % (uploader, slug_title))
3620
3621 # extract uid and stream token that soundcloud hands out for access
3622 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3623 if mobj:
3624 video_id = mobj.group(1)
3625 stream_token = mobj.group(2)
3626
3627 # extract unsimplified title
3628 mobj = re.search('"title":"(.*?)",', webpage)
3629 if mobj:
3630 title = mobj.group(1)
3631
3632 # construct media url (with uid/token)
3633 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3634 mediaURL = mediaURL % (video_id, stream_token)
3635
3636 # description
3637 description = u'No description available'
3638 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3639 if mobj:
3640 description = mobj.group(1)
3641
3642 # upload date
3643 upload_date = None
3644 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3645 if mobj:
3646 try:
3647 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3648 except Exception, e:
3649 print str(e)
3650
3651 # for soundcloud, a request to a cross domain is required for cookies
3652 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3653
3654 try:
3655 self._downloader.process_info({
3656 'id': video_id.decode('utf-8'),
3657 'url': mediaURL,
3658 'uploader': uploader.decode('utf-8'),
3659 'upload_date': upload_date,
3660 'title': simple_title.decode('utf-8'),
3661 'stitle': simple_title.decode('utf-8'),
3662 'ext': u'mp3',
3663 'format': u'NA',
3664 'player_url': None,
3665 'description': description.decode('utf-8')
3666 })
3667 except UnavailableVideoError:
3668 self._downloader.trouble(u'\nERROR: unable to download video')
3669
3670
3671 class InfoQIE(InfoExtractor):
3672 """Information extractor for infoq.com"""
3673
3674 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3675 IE_NAME = u'infoq'
3676
3677 def report_webpage(self, video_id):
3678 """Report information extraction."""
3679 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3680
3681 def report_extraction(self, video_id):
3682 """Report information extraction."""
3683 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3684
3685 def _real_extract(self, url):
3686 htmlParser = HTMLParser.HTMLParser()
3687
3688 mobj = re.match(self._VALID_URL, url)
3689 if mobj is None:
3690 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3691 return
3692
3693 self.report_webpage(url)
3694
3695 request = urllib2.Request(url)
3696 try:
3697 webpage = urllib2.urlopen(request).read()
3698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3699 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3700 return
3701
3702 self.report_extraction(url)
3703
3704
3705 # Extract video URL
3706 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3707 if mobj is None:
3708 self._downloader.trouble(u'ERROR: unable to extract video url')
3709 return
3710 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3711
3712
3713 # Extract title
3714 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3715 if mobj is None:
3716 self._downloader.trouble(u'ERROR: unable to extract video title')
3717 return
3718 video_title = mobj.group(1).decode('utf-8')
3719
3720 # Extract description
3721 video_description = u'No description available.'
3722 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3723 if mobj is not None:
3724 video_description = mobj.group(1).decode('utf-8')
3725
3726 video_filename = video_url.split('/')[-1]
3727 video_id, extension = video_filename.split('.')
3728
3729 self._downloader.increment_downloads()
3730 info = {
3731 'id': video_id,
3732 'url': video_url,
3733 'uploader': None,
3734 'upload_date': None,
3735 'title': video_title,
3736 'stitle': _simplify_title(video_title),
3737 'ext': extension,
3738 'format': extension, # Extension is always(?) mp4, but seems to be flv
3739 'thumbnail': None,
3740 'description': video_description,
3741 'player_url': None,
3742 }
3743
3744 try:
3745 self._downloader.process_info(info)
3746 except UnavailableVideoError, err:
3747 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3748
3749 class MixcloudIE(InfoExtractor):
3750 """Information extractor for www.mixcloud.com"""
3751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3752 IE_NAME = u'mixcloud'
3753
3754 def __init__(self, downloader=None):
3755 InfoExtractor.__init__(self, downloader)
3756
3757 def report_download_json(self, file_id):
3758 """Report JSON download."""
3759 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3760
3761 def report_extraction(self, file_id):
3762 """Report information extraction."""
3763 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3764
3765 def get_urls(self, jsonData, fmt, bitrate='best'):
3766 """Get urls from 'audio_formats' section in json"""
3767 file_url = None
3768 try:
3769 bitrate_list = jsonData[fmt]
3770 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3771 bitrate = max(bitrate_list) # select highest
3772
3773 url_list = jsonData[fmt][bitrate]
3774 except TypeError: # we have no bitrate info.
3775 url_list = jsonData[fmt]
3776
3777 return url_list
3778
3779 def check_urls(self, url_list):
3780 """Returns 1st active url from list"""
3781 for url in url_list:
3782 try:
3783 urllib2.urlopen(url)
3784 return url
3785 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3786 url = None
3787
3788 return None
3789
3790 def _print_formats(self, formats):
3791 print 'Available formats:'
3792 for fmt in formats.keys():
3793 for b in formats[fmt]:
3794 try:
3795 ext = formats[fmt][b][0]
3796 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3797 except TypeError: # we have no bitrate info
3798 ext = formats[fmt][0]
3799 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3800 break
3801
3802 def _real_extract(self, url):
3803 mobj = re.match(self._VALID_URL, url)
3804 if mobj is None:
3805 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3806 return
3807 # extract uploader & filename from url
3808 uploader = mobj.group(1).decode('utf-8')
3809 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3810
3811 # construct API request
3812 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3813 # retrieve .json file with links to files
3814 request = urllib2.Request(file_url)
3815 try:
3816 self.report_download_json(file_url)
3817 jsonData = urllib2.urlopen(request).read()
3818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3819 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3820 return
3821
3822 # parse JSON
3823 json_data = json.loads(jsonData)
3824 player_url = json_data['player_swf_url']
3825 formats = dict(json_data['audio_formats'])
3826
3827 req_format = self._downloader.params.get('format', None)
3828 bitrate = None
3829
3830 if self._downloader.params.get('listformats', None):
3831 self._print_formats(formats)
3832 return
3833
3834 if req_format is None or req_format == 'best':
3835 for format_param in formats.keys():
3836 url_list = self.get_urls(formats, format_param)
3837 # check urls
3838 file_url = self.check_urls(url_list)
3839 if file_url is not None:
3840 break # got it!
3841 else:
3842 if req_format not in formats.keys():
3843 self._downloader.trouble(u'ERROR: format is not available')
3844 return
3845
3846 url_list = self.get_urls(formats, req_format)
3847 file_url = self.check_urls(url_list)
3848 format_param = req_format
3849
3850 # We have audio
3851 self._downloader.increment_downloads()
3852 try:
3853 # Process file information
3854 self._downloader.process_info({
3855 'id': file_id.decode('utf-8'),
3856 'url': file_url.decode('utf-8'),
3857 'uploader': uploader.decode('utf-8'),
3858 'upload_date': u'NA',
3859 'title': json_data['name'],
3860 'stitle': _simplify_title(json_data['name']),
3861 'ext': file_url.split('.')[-1].decode('utf-8'),
3862 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3863 'thumbnail': json_data['thumbnail_url'],
3864 'description': json_data['description'],
3865 'player_url': player_url.decode('utf-8'),
3866 })
3867 except UnavailableVideoError, err:
3868 self._downloader.trouble(u'ERROR: unable to download file')
3869
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	# Matches the site root, a course page (CoursePage.php?course=...) or a
	# single video page (VideoPage.php?course=...&video=...).
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL shape: a single video, one course page, or the
		site root.  The playlist branches recurse via self.extract() for
		every referenced page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata is published as an XML file next to the videos.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			# Extension taken from the last dot of the file URL.
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the simplified id if missing.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Every linked VideoPage becomes a 'reference' entry, extracted
			# recursively below.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Recurse into every course page linked from the home page.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
3988
3989 class MTVIE(InfoExtractor):
3990 """Information extractor for MTV.com"""
3991
3992 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3993 IE_NAME = u'mtv'
3994
3995 def report_webpage(self, video_id):
3996 """Report information extraction."""
3997 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3998
3999 def report_extraction(self, video_id):
4000 """Report information extraction."""
4001 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4002
4003 def _real_extract(self, url):
4004 mobj = re.match(self._VALID_URL, url)
4005 if mobj is None:
4006 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4007 return
4008 if not mobj.group('proto'):
4009 url = 'http://' + url
4010 video_id = mobj.group('videoid')
4011 self.report_webpage(video_id)
4012
4013 request = urllib2.Request(url)
4014 try:
4015 webpage = urllib2.urlopen(request).read()
4016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4017 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4018 return
4019
4020 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4021 if mobj is None:
4022 self._downloader.trouble(u'ERROR: unable to extract song name')
4023 return
4024 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4025 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4026 if mobj is None:
4027 self._downloader.trouble(u'ERROR: unable to extract performer')
4028 return
4029 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4030 video_title = performer + ' - ' + song_name
4031
4032 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4033 if mobj is None:
4034 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4035 return
4036 mtvn_uri = mobj.group(1)
4037
4038 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4039 if mobj is None:
4040 self._downloader.trouble(u'ERROR: unable to extract content id')
4041 return
4042 content_id = mobj.group(1)
4043
4044 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4045 self.report_extraction(video_id)
4046 request = urllib2.Request(videogen_url)
4047 try:
4048 metadataXml = urllib2.urlopen(request).read()
4049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4050 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4051 return
4052
4053 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4054 renditions = mdoc.findall('.//rendition')
4055
4056 # For now, always pick the highest quality.
4057 rendition = renditions[-1]
4058
4059 try:
4060 _,_,ext = rendition.attrib['type'].partition('/')
4061 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4062 video_url = rendition.find('./src').text
4063 except KeyError:
4064 self._downloader.trouble('Invalid rendition field.')
4065 return
4066
4067 self._downloader.increment_downloads()
4068 info = {
4069 'id': video_id,
4070 'url': video_url,
4071 'uploader': performer,
4072 'title': video_title,
4073 'stitle': _simplify_title(video_title),
4074 'ext': ext,
4075 'format': format,
4076 }
4077
4078 try:
4079 self._downloader.process_info(info)
4080 except UnavailableVideoError, err:
4081 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4082
4083
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download, the downloader walks its chain
	of PostProcessors, feeding the first one the download's info dictionary
	and each subsequent one the value returned by its predecessor.

	A return value of None stops the chain; otherwise the returned
	dictionary is handed to the next PostProcessor.

	Like InfoExtractor objects, PostProcessors register mutually with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the downloader this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		The "information" dictionary has the same shape as the ones built
		by InfoExtractors, plus an extra "filepath" key naming the file
		that was downloaded.

		Returning None halts the postprocessing chain; returning a
		(possibly modified) info dictionary passes it along to the next
		PostProcessor. Implementations may also raise PostProcessingError,
		which the calling downloader handles.
		"""
		# Base class behavior: hand the info dict through untouched.
		return information
4129
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails during audio extraction.

	Derives from Exception rather than BaseException so that generic
	`except Exception` handlers catch it; existing handlers that catch
	AudioConversionError directly are unaffected. The base initializer is
	called so str(err) carries the message.
	"""

	def __init__(self, message):
		Exception.__init__(self, message)
		self.message = message
4133
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffmpeg, probing codecs with ffprobe."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec  # target codec name, or 'best' to avoid re-encoding
		self._preferredquality = preferredquality  # bitrate handed to ffmpeg's -ab, if set
		self._keepvideo = keepvideo  # when False the source video file is deleted afterwards

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in *path* via ffprobe,
		or None if ffprobe cannot be run or no audio stream is found."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name before codec_type within each stream
		# block, so remember the last codec_name and report it once a
		# codec_type=audio line confirms the stream is audio.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert *path* to *out_path* with ffmpeg.

		codec: value for -acodec, or None to let ffmpeg pick.
		more_opts: extra command-line options appended before the output.
		Raises AudioConversionError if ffmpeg is missing or exits non-zero.
		"""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself is missing.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# The last stderr line usually carries ffmpeg's actual error.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Extract audio from information['filepath'].

		Returns the info dict with 'filepath' pointing at the audio file,
		or None on failure (which stops the postprocessing chain).
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		# First branch: the existing stream already matches the request,
		# so copy it without re-encoding where possible.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4258
4259
4260 def updateSelf(downloader, filename):
4261 ''' Update the program file with the latest version from the repository '''
4262 # Note: downloader only used for options
4263 if not os.access(filename, os.W_OK):
4264 sys.exit('ERROR: no write permissions on %s' % filename)
4265
4266 downloader.to_screen(u'Updating to latest version...')
4267
4268 try:
4269 try:
4270 urlh = urllib.urlopen(UPDATE_URL)
4271 newcontent = urlh.read()
4272
4273 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4274 if vmatch is not None and vmatch.group(1) == __version__:
4275 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4276 return
4277 finally:
4278 urlh.close()
4279 except (IOError, OSError), err:
4280 sys.exit('ERROR: unable to download latest version')
4281
4282 try:
4283 outf = open(filename, 'wb')
4284 try:
4285 outf.write(newcontent)
4286 finally:
4287 outf.close()
4288 except (IOError, OSError), err:
4289 sys.exit('ERROR: unable to overwrite current version')
4290
4291 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4292
4293 def parseOpts():
4294 def _readOptions(filename_bytes):
4295 try:
4296 optionf = open(filename_bytes)
4297 except IOError:
4298 return [] # silently skip if file is not present
4299 try:
4300 res = []
4301 for l in optionf:
4302 res += shlex.split(l, comments=True)
4303 finally:
4304 optionf.close()
4305 return res
4306
4307 def _format_option_string(option):
4308 ''' ('-o', '--option') -> -o, --format METAVAR'''
4309
4310 opts = []
4311
4312 if option._short_opts: opts.append(option._short_opts[0])
4313 if option._long_opts: opts.append(option._long_opts[0])
4314 if len(opts) > 1: opts.insert(1, ', ')
4315
4316 if option.takes_value(): opts.append(' %s' % option.metavar)
4317
4318 return "".join(opts)
4319
4320 def _find_term_columns():
4321 columns = os.environ.get('COLUMNS', None)
4322 if columns:
4323 return int(columns)
4324
4325 try:
4326 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4327 out,err = sp.communicate()
4328 return int(out.split()[1])
4329 except:
4330 pass
4331 return None
4332
4333 max_width = 80
4334 max_help_position = 80
4335
4336 # No need to wrap help messages if we're on a wide console
4337 columns = _find_term_columns()
4338 if columns: max_width = columns
4339
4340 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4341 fmt.format_option_strings = _format_option_string
4342
4343 kw = {
4344 'version' : __version__,
4345 'formatter' : fmt,
4346 'usage' : '%prog [options] url [url...]',
4347 'conflict_handler' : 'resolve',
4348 }
4349
4350 parser = optparse.OptionParser(**kw)
4351
4352 # option groups
4353 general = optparse.OptionGroup(parser, 'General Options')
4354 selection = optparse.OptionGroup(parser, 'Video Selection')
4355 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4356 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4357 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4358 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4359 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4360
4361 general.add_option('-h', '--help',
4362 action='help', help='print this help text and exit')
4363 general.add_option('-v', '--version',
4364 action='version', help='print program version and exit')
4365 general.add_option('-U', '--update',
4366 action='store_true', dest='update_self', help='update this program to latest version')
4367 general.add_option('-i', '--ignore-errors',
4368 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4369 general.add_option('-r', '--rate-limit',
4370 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4371 general.add_option('-R', '--retries',
4372 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4373 general.add_option('--dump-user-agent',
4374 action='store_true', dest='dump_user_agent',
4375 help='display the current browser identification', default=False)
4376 general.add_option('--list-extractors',
4377 action='store_true', dest='list_extractors',
4378 help='List all supported extractors and the URLs they would handle', default=False)
4379
4380 selection.add_option('--playlist-start',
4381 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4382 selection.add_option('--playlist-end',
4383 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4384 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4385 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4386 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4387
4388 authentication.add_option('-u', '--username',
4389 dest='username', metavar='USERNAME', help='account username')
4390 authentication.add_option('-p', '--password',
4391 dest='password', metavar='PASSWORD', help='account password')
4392 authentication.add_option('-n', '--netrc',
4393 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4394
4395
4396 video_format.add_option('-f', '--format',
4397 action='store', dest='format', metavar='FORMAT', help='video format code')
4398 video_format.add_option('--all-formats',
4399 action='store_const', dest='format', help='download all available video formats', const='all')
4400 video_format.add_option('--prefer-free-formats',
4401 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4402 video_format.add_option('--max-quality',
4403 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4404 video_format.add_option('-F', '--list-formats',
4405 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4406 video_format.add_option('--write-srt',
4407 action='store_true', dest='writesubtitles',
4408 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4409 video_format.add_option('--srt-lang',
4410 action='store', dest='subtitleslang', metavar='LANG',
4411 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4412
4413
4414 verbosity.add_option('-q', '--quiet',
4415 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4416 verbosity.add_option('-s', '--simulate',
4417 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4418 verbosity.add_option('--skip-download',
4419 action='store_true', dest='skip_download', help='do not download the video', default=False)
4420 verbosity.add_option('-g', '--get-url',
4421 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4422 verbosity.add_option('-e', '--get-title',
4423 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4424 verbosity.add_option('--get-thumbnail',
4425 action='store_true', dest='getthumbnail',
4426 help='simulate, quiet but print thumbnail URL', default=False)
4427 verbosity.add_option('--get-description',
4428 action='store_true', dest='getdescription',
4429 help='simulate, quiet but print video description', default=False)
4430 verbosity.add_option('--get-filename',
4431 action='store_true', dest='getfilename',
4432 help='simulate, quiet but print output filename', default=False)
4433 verbosity.add_option('--get-format',
4434 action='store_true', dest='getformat',
4435 help='simulate, quiet but print output format', default=False)
4436 verbosity.add_option('--no-progress',
4437 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4438 verbosity.add_option('--console-title',
4439 action='store_true', dest='consoletitle',
4440 help='display progress in console titlebar', default=False)
4441 verbosity.add_option('-v', '--verbose',
4442 action='store_true', dest='verbose', help='print various debugging information', default=False)
4443
4444
4445 filesystem.add_option('-t', '--title',
4446 action='store_true', dest='usetitle', help='use title in file name', default=False)
4447 filesystem.add_option('-l', '--literal',
4448 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4449 filesystem.add_option('-A', '--auto-number',
4450 action='store_true', dest='autonumber',
4451 help='number downloaded files starting from 00000', default=False)
4452 filesystem.add_option('-o', '--output',
4453 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4454 filesystem.add_option('-a', '--batch-file',
4455 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4456 filesystem.add_option('-w', '--no-overwrites',
4457 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4458 filesystem.add_option('-c', '--continue',
4459 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4460 filesystem.add_option('--no-continue',
4461 action='store_false', dest='continue_dl',
4462 help='do not resume partially downloaded files (restart from beginning)')
4463 filesystem.add_option('--cookies',
4464 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4465 filesystem.add_option('--no-part',
4466 action='store_true', dest='nopart', help='do not use .part files', default=False)
4467 filesystem.add_option('--no-mtime',
4468 action='store_false', dest='updatetime',
4469 help='do not use the Last-modified header to set the file modification time', default=True)
4470 filesystem.add_option('--write-description',
4471 action='store_true', dest='writedescription',
4472 help='write video description to a .description file', default=False)
4473 filesystem.add_option('--write-info-json',
4474 action='store_true', dest='writeinfojson',
4475 help='write video metadata to a .info.json file', default=False)
4476
4477
4478 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4479 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4480 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4481 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4482 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4483 help='ffmpeg audio bitrate specification, 128k by default')
4484 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4485 help='keeps the video file on disk after the post-processing; the video is erased by default')
4486
4487
4488 parser.add_option_group(general)
4489 parser.add_option_group(selection)
4490 parser.add_option_group(filesystem)
4491 parser.add_option_group(verbosity)
4492 parser.add_option_group(video_format)
4493 parser.add_option_group(authentication)
4494 parser.add_option_group(postproc)
4495
4496 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4497 if xdg_config_home:
4498 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4499 else:
4500 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4501 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4502 opts, args = parser.parse_args(argv)
4503
4504 return parser, opts, args
4505
def gen_extractors():
	"""Build and return the ordered list of information-extractor instances.

	Order is significant: a URL is handled by the first extractor whose
	suitable() check matches it, so the catch-all GenericIE must come last.
	"""
	# Shared instances: the playlist/user/search extractors delegate to
	# the same YoutubeIE object, and likewise for Google and Yahoo.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()

	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic fallback goes last so that every specific extractor wins.
	extractors.append(GenericIE())
	return extractors
4542
def _real_main():
	"""Command-line entry point: parse and validate options, set up the
	global urllib2 opener, build a FileDownloader with all extractors and
	post-processors attached, and download every requested URL.

	Terminates the process via sys.exit() on every path; never returns
	normally. Exceptions it does not catch (e.g. DownloadError) are
	translated into exit codes by main().
	"""
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		# No --cookies given: use an in-memory jar that is never persisted.
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only pre-load the jar when the file already exists and is
			# readable; a missing file is fine (it is created on save below).
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				# '-' means: read the URL list from standard input.
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and comment lines starting with '#', '/' or ';'.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	# Batch-file URLs come first, then positional command-line arguments.
	all_urls = batchurls + args
	all_urls = map(lambda url: url.strip(), all_urls)

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	# Installed globally so every urllib2 request in the program shares
	# the same cookies and proxy configuration.
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		# For each extractor, print its name followed by the given URLs it
		# would claim; each URL is consumed by the first matching extractor.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u' ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username given without a password: prompt for it interactively.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		# parse_bytes understands suffixed values like '50k' or '44.6m';
		# it returns None on a malformed specification.
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			# long() (Python 2) also accepts very large retry counts.
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any --get-* option implies quiet mode so that only the requested
		# field is written to stdout.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		# Simulation and the --get-* options all suppress the actual download.
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# Output template: an explicit -o wins; otherwise pick a default
		# based on the format/title/literal/autonumber flags, falling back
		# to plain '%(id)s.%(ext)s'. The or-chain returns the first truthy
		# template whose conditions all hold.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# When the video itself goes to stdout ('-o -'), status messages
		# must go to stderr instead.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		# NOTE(review): presumably updateSelf rewrites sys.argv[0] in place
		# and may terminate the process itself — confirm against its body.
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			# --update with no URLs is a valid invocation: update and exit.
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		# --max-downloads aborts the run via this exception; exit code 101
		# distinguishes it from ordinary failures.
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
4717
def main():
	"""Outermost entry point: run _real_main() and map the well-known
	failure modes onto process exit codes / messages."""
	try:
		_real_main()
	except KeyboardInterrupt:
		# Ctrl-C: report the interruption on stderr and exit non-zero.
		sys.exit(u'\nERROR: Interrupted by user')
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except DownloadError:
		# The downloader already printed its own diagnostics.
		sys.exit(1)
4727
# Run the command-line interface only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
	main()
4730
4731 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: