]> jfr.im git - yt-dlp.git/blob - youtube_dl/__init__.py
Only skip download if files exists; convert audio
[yt-dlp.git] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 'Sören Schulze',
15 'Kevin Ngo',
16 'Ori Avtalion',
17 'shizeeg',
18 )
19
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.15'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
# Default HTTP headers sent with every request. They mimic a desktop
# Firefox browser so that sites serve the same content they would serve
# to a real user (YoutubeDLHandler merges these into each request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal stand-in for the stdlib json module (decoding only).

        Implements a recursive-descent JSON parser; only loads() is
        provided, so code that needs json.dump must probe for it first.
        """
        @staticmethod
        def loads(s):
            """Decode a UTF-8 encoded JSON document into Python objects."""
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; optionally fail on premature end.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    # Surrogate pair (\uD8xx\uDCxx) combined into one code point.
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote, skipping over backslash-escaped ones
                # (an even number of preceding backslashes means unescaped).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # Literals: true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of a value; numbers are the default.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale reports an unusable encoding.
    """
    # The original routed this through a one-shot generator and .next(),
    # which added nothing; compute the value directly instead.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
211
212
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
215
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
239
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then neutralize the path separator so the
    # resulting name cannot escape into other directories.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
244
245
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so video data is not mangled
                # by newline translation on Windows.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # The string was not a valid RFC 2822 date.
        return None
    return email.utils.mktime_tz(parsed)
280
def _simplify_title(title):
    """Collapse runs of non-word characters into single underscores and
    trim leading/trailing underscores, yielding a filesystem-safe slug."""
    expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
    return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Stored so the caller can report how far the transfer got.
        self.downloaded = downloaded
        self.expected = expected
348
349
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress a 'deflate'-encoded body.

        Some servers send raw deflate data, others a full zlib stream;
        try the raw form first and fall back to the zlib form.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Python versions whose addinfourl lacks getcode() (pre-2.6) do not
        # accept a 'code' constructor argument; set the attribute by hand.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Force std_headers onto the outgoing request and strip the
        internal Youtubedl-no-compression marker header."""
        for h in std_headers:
            # Replace any caller-supplied header that collides with ours.
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Transparently decompress gzip/deflate-encoded responses."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
407
408
409 class FileDownloader(object):
410 """File Downloader class.
411
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
418
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
426
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
433
434 Available options:
435
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
465 """
466
    # Class-level defaults; the real per-instance values are set in __init__.
    params = None # Dictionary of options passed to the constructor
    _ies = [] # Registered InfoExtractors, tried in order
    _pps = [] # Registered PostProcessors, run as a chain
    _download_retcode = None # Process exit code reported by download()
    _num_downloads = None # Ordinal used for the %(autonumber)s template field
    _screen_file = None # Where to_screen() writes (stdout, or stderr with logtostderr)
473
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
476 self._ies = []
477 self._pps = []
478 self._download_retcode = 0
479 self._num_downloads = 0
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
481 self.params = params
482
    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable size string, e.g. '12.34M', for a byte count."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        # Pick the largest power of 1024 that fits; 0 bytes maps to exponent 0.
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)
496
497 @staticmethod
498 def calc_percent(byte_counter, data_len):
499 if data_len is None:
500 return '---.-%'
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
502
    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining download time as an 'MM:SS' string."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # The display only has room for two digits of minutes.
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Format the average download speed since 'start' for display."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's throughput,
        at most doubling or halving it per step."""
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix indexes 'b' (position 0), i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
546
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        # Mutual registration: the IE keeps a back-reference to report results.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)
556
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout (or stderr with logtostderr) if not in
        quiet mode. Encoding errors are re-raised unless suppressed."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # Xterm-style escape: OSC 0 sets both icon name and window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
582
    def fixed_template(self):
        """Checks if the output template is fixed, i.e. contains no
        %(...)s substitution fields (every download would share one name)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # With --ignore-errors: remember the failure for the exit code.
        self._download_retcode = 1
599
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
604 return
605 now = time.time()
606 elapsed = now - start_time
607 if elapsed <= 0.0:
608 return
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
612
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
617 return filename
618 return filename + u'.part'
619
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
623 return filename
624
    def try_rename(self, old_filename, new_filename):
        """Move the finished .part file into place; failures are reported
        via trouble() rather than raised."""
        try:
            if old_filename == new_filename:
                # temp_name() returned the filename unchanged; nothing to do.
                return
            os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
632
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr is None:
636 return
637 if not os.path.isfile(filename):
638 return
639 timestr = last_modified_hdr
640 if timestr is None:
641 return
642 filetime = timeconvert(timestr)
643 if filetime is None:
644 return filetime
645 try:
646 os.utime(filename, (time.time(), filetime))
647 except:
648 pass
649 return filetime
650
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress on screen and in the console title."""
        if self.params.get('noprogress', False):
            return
        # \r rewinds to the line start so the progress line updates in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename may not be representable in the console encoding.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # The progress line is already on screen; just terminate it.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1
701
    def prepare_filename(self, info_dict):
        """Generate the output filename from the outtmpl template.

        Returns None (after reporting trouble) if the template references
        a missing key or is otherwise malformed.
        """
        try:
            template_dict = dict(info_dict)
            # Extra substitution fields beyond what the InfoExtractor supplies.
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
            return filename
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return None
713
714 def _match_entry(self, info_dict):
715 """ Returns None iff the file should be downloaded """
716
717 title = info_dict['title']
718 matchtitle = self.params.get('matchtitle', False)
719 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721 rejecttitle = self.params.get('rejecttitle', False)
722 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
724 return None
725
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        Handles title match/reject filtering, the --max-downloads cap,
        forced printings, simulation mode, writing the .description and
        .info.json sidecar files, the actual download, and finally the
        postprocessing chain.
        """

        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            return

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            try:
                # Probe for a usable encoder: the trivialjson fallback class
                # only implements loads(), not dump().
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    # 'urlhandle' holds an open connection object and is not
                    # JSON-serializable; drop it from the written metadata.
                    json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(filename):
                success = True
            else:
                try:
                    success = self._do_download(filename, info_dict)
                except (OSError, IOError), err:
                    raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                    return
                except (ContentTooShortError, ), err:
                    self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                    return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return
823
824 def download(self, url_list):
825 """Download a given list of URLs."""
826 if len(url_list) > 1 and self.fixed_template():
827 raise SameFileError(self.params['outtmpl'])
828
829 for url in url_list:
830 suitable_found = False
831 for ie in self._ies:
832 # Go to next InfoExtractor if not suitable
833 if not ie.suitable(url):
834 continue
835
836 # Suitable InfoExtractor found
837 suitable_found = True
838
839 # Extract information from URL and process it
840 ie.extract(url)
841
842 # Suitable InfoExtractor had been found; go to next URL
843 break
844
845 if not suitable_found:
846 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
847
848 return self._download_retcode
849
850 def post_process(self, filename, ie_info):
851 """Run the postprocessing chain on the given file."""
852 info = dict(ie_info)
853 info['filepath'] = filename
854 for pp in self._pps:
855 info = pp.run(info)
856 if info is None:
857 break
858
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump binary.

        Returns True on success, False otherwise (after reporting trouble).
        """
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); add '-k 1' only after a previous exit code of 1.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            # No progress and exit code 1: give up instead of looping forever.
            if prevsize == cursize and retval == 1:
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False
895
896 def _do_download(self, filename, info_dict):
897 url = info_dict['url']
898 player_url = info_dict.get('player_url', None)
899
900 # Check file already present
901 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
902 self.report_file_already_downloaded(filename)
903 return True
904
905 # Attempt to download using rtmpdump
906 if url.startswith('rtmp'):
907 return self._download_with_rtmpdump(filename, url, player_url)
908
909 tmpfilename = self.temp_name(filename)
910 stream = None
911
912 # Do not include the Accept-Encoding header
913 headers = {'Youtubedl-no-compression': 'True'}
914 basic_request = urllib2.Request(url, None, headers)
915 request = urllib2.Request(url, None, headers)
916
917 # Establish possible resume length
918 if os.path.isfile(tmpfilename):
919 resume_len = os.path.getsize(tmpfilename)
920 else:
921 resume_len = 0
922
923 open_mode = 'wb'
924 if resume_len != 0:
925 if self.params.get('continuedl', False):
926 self.report_resuming_byte(resume_len)
927 request.add_header('Range','bytes=%d-' % resume_len)
928 open_mode = 'ab'
929 else:
930 resume_len = 0
931
932 count = 0
933 retries = self.params.get('retries', 0)
934 while count <= retries:
935 # Establish connection
936 try:
937 if count == 0 and 'urlhandle' in info_dict:
938 data = info_dict['urlhandle']
939 data = urllib2.urlopen(request)
940 break
941 except (urllib2.HTTPError, ), err:
942 if (err.code < 500 or err.code >= 600) and err.code != 416:
943 # Unexpected HTTP error
944 raise
945 elif err.code == 416:
946 # Unable to resume (requested range not satisfiable)
947 try:
948 # Open the connection again without the range header
949 data = urllib2.urlopen(basic_request)
950 content_length = data.info()['Content-Length']
951 except (urllib2.HTTPError, ), err:
952 if err.code < 500 or err.code >= 600:
953 raise
954 else:
955 # Examine the reported length
956 if (content_length is not None and
957 (resume_len - 100 < long(content_length) < resume_len + 100)):
958 # The file had already been fully downloaded.
959 # Explanation to the above condition: in issue #175 it was revealed that
960 # YouTube sometimes adds or removes a few bytes from the end of the file,
961 # changing the file size slightly and causing problems for some users. So
962 # I decided to implement a suggested change and consider the file
963 # completely downloaded if the file size differs less than 100 bytes from
964 # the one in the hard drive.
965 self.report_file_already_downloaded(filename)
966 self.try_rename(tmpfilename, filename)
967 return True
968 else:
969 # The length does not match, we start the download over
970 self.report_unable_to_resume()
971 open_mode = 'wb'
972 break
973 # Retry
974 count += 1
975 if count <= retries:
976 self.report_retry(count, retries)
977
978 if count > retries:
979 self.trouble(u'ERROR: giving up after %s retries' % retries)
980 return False
981
982 data_len = data.info().get('Content-length', None)
983 if data_len is not None:
984 data_len = long(data_len) + resume_len
985 data_len_str = self.format_bytes(data_len)
986 byte_counter = 0 + resume_len
987 block_size = 1024
988 start = time.time()
989 while True:
990 # Download and write
991 before = time.time()
992 data_block = data.read(block_size)
993 after = time.time()
994 if len(data_block) == 0:
995 break
996 byte_counter += len(data_block)
997
998 # Open file just in time
999 if stream is None:
1000 try:
1001 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1002 assert stream is not None
1003 filename = self.undo_temp_name(tmpfilename)
1004 self.report_destination(filename)
1005 except (OSError, IOError), err:
1006 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1007 return False
1008 try:
1009 stream.write(data_block)
1010 except (IOError, OSError), err:
1011 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1012 return False
1013 block_size = self.best_block_size(after - before, len(data_block))
1014
1015 # Progress message
1016 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1017 if data_len is None:
1018 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1019 else:
1020 percent_str = self.calc_percent(byte_counter, data_len)
1021 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1022 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1023
1024 # Apply rate limit
1025 self.slow_down(start, byte_counter - resume_len)
1026
1027 if stream is None:
1028 self.trouble(u'\nERROR: Did not get any data blocks')
1029 return False
1030 stream.close()
1031 self.report_finish()
1032 if data_len is not None and byte_counter != data_len:
1033 raise ContentTooShortError(byte_counter, long(data_len))
1034 self.try_rename(tmpfilename, filename)
1035
1036 # Update file modification time
1037 if self.params.get('updatetime', True):
1038 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1039
1040 return True
1041
1042
class InfoExtractor(object):
    """Information Extractor class.

    An information extractor turns a URL into one or more dictionaries of
    video metadata, which it feeds to the attached FileDownloader via
    process_info(). Each dictionary must provide:

        id:         Video identifier.
        url:        Final video URL.
        uploader:   Nickname of the video uploader.
        title:      Literal title.
        stitle:     Simplified title.
        ext:        Video filename extension.
        format:     Video format.
        player_url: SWF Player URL (may be None).

    Optional fields, used only by the forced-printing functions (e.g. for
    search front-ends such as youtube2mp3):

        thumbnail:   Full URL to a video thumbnail image.
        description: One-line video description.

    Subclasses override _real_initialize() and _real_extract(), define a
    _VALID_URL regexp, and are normally registered in the extractor list.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def initialize(self):
        """Perform one-time setup (authentication, etc) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract and return the info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE should report to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Subclass hook: the actual initialization work."""
        pass

    def _real_extract(self, url):
        """Subclass hook: the actual extraction work."""
        pass
1112
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 2 is the video id; the conditional group (?(1)...) allows a bare
    # video id only when no URL prefix (group 1) was matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same formats, but free (WebM) containers preferred within each quality tier.
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
    }
    # itag -> dimensions string, informational only (shown by --list-formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age. All failures are reported as warnings or
        errors but never raised."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next': '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info data, pick the format(s)
        to fetch according to the user's 'format' option, and hand each one to
        the downloader via process_info()."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' parameter values in turn; some work for embedded or
        # age-restricted videos where others fail to return a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try each known date format;
            # the raw string is kept if none of them parse.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        try:
            # Raises NameError when the optional lxml import at module top failed.
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            # Only fall back to the crude regex when the description is needed.
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: no itag, single URL.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded dict with
            # at least 'itag' and 'url'.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Drop everything of higher quality than the limit.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1431
1432
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for 'yt-' ids, which are hosted on YouTube.
    _youtube_ie = None
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Requires a YoutubeIE for delegation of yt- videos."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and post the family-filter form so the
        session is allowed to see age-restricted content."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe watch
        page, delegating 'yt-' prefixed ids to the YouTube extractor."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL directly embedded.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media URL and key inside the flashvars value.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1573
1574
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion
        video page."""
        htmlParser = HTMLParser.HTMLParser()

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted videos are visible.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        # The 'sequence' flashvar contains the SD stream URL.
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = htmlParser.unescape(mobj.group('title')).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1663
1664
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description from a Google Video
        play page; optionally also looks up the thumbnail."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download link; fall back to the flv stream URL,
            # which is hex-escaped in the page source.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search result page, so run a
            # site-restricted search for this video id.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1767
1768
1769 class PhotobucketIE(InfoExtractor):
1770 """Information extractor for photobucket.com."""
1771
1772 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1773 IE_NAME = u'photobucket'
1774
1775 def __init__(self, downloader=None):
1776 InfoExtractor.__init__(self, downloader)
1777
1778 def report_download_webpage(self, video_id):
1779 """Report webpage download."""
1780 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1781
1782 def report_extraction(self, video_id):
1783 """Report information extraction."""
1784 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1785
1786 def _real_extract(self, url):
1787 # Extract id from URL
1788 mobj = re.match(self._VALID_URL, url)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1791 return
1792
1793 # At this point we have a new video
1794 self._downloader.increment_downloads()
1795 video_id = mobj.group(1)
1796
1797 video_extension = 'flv'
1798
1799 # Retrieve video webpage to extract further information
1800 request = urllib2.Request(url)
1801 try:
1802 self.report_download_webpage(video_id)
1803 webpage = urllib2.urlopen(request).read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1806 return
1807
1808 # Extract URL, uploader, and title from webpage
1809 self.report_extraction(video_id)
1810 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1811 if mobj is None:
1812 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 return
1814 mediaURL = urllib.unquote(mobj.group(1))
1815
1816 video_url = mediaURL
1817
1818 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1819 if mobj is None:
1820 self._downloader.trouble(u'ERROR: unable to extract title')
1821 return
1822 video_title = mobj.group(1).decode('utf-8')
1823 video_title = sanitize_title(video_title)
1824 simple_title = _simplify_title(vide_title)
1825
1826 video_uploader = mobj.group(2).decode('utf-8')
1827
1828 try:
1829 # Process video information
1830 self._downloader.process_info({
1831 'id': video_id.decode('utf-8'),
1832 'url': video_url.decode('utf-8'),
1833 'uploader': video_uploader,
1834 'upload_date': u'NA',
1835 'title': video_title,
1836 'stitle': simple_title,
1837 'ext': video_extension.decode('utf-8'),
1838 'format': u'NA',
1839 'player_url': None,
1840 })
1841 except UnavailableVideoError:
1842 self._downloader.trouble(u'\nERROR: unable to download video')
1843
1844
1845 class YahooIE(InfoExtractor):
1846 """Information extractor for video.yahoo.com."""
1847
1848 # _VALID_URL matches all Yahoo! Video URLs
1849 # _VPAGE_URL matches only the extractable '/watch/' URLs
1850 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1851 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1852 IE_NAME = u'video.yahoo'
1853
1854 def __init__(self, downloader=None):
1855 InfoExtractor.__init__(self, downloader)
1856
1857 def report_download_webpage(self, video_id):
1858 """Report webpage download."""
1859 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1860
1861 def report_extraction(self, video_id):
1862 """Report information extraction."""
1863 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1864
1865 def _real_extract(self, url, new_video=True):
1866 # Extract ID from URL
1867 mobj = re.match(self._VALID_URL, url)
1868 if mobj is None:
1869 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1870 return
1871
1872 # At this point we have a new video
1873 self._downloader.increment_downloads()
1874 video_id = mobj.group(2)
1875 video_extension = 'flv'
1876
1877 # Rewrite valid but non-extractable URLs as
1878 # extractable English language /watch/ URLs
1879 if re.match(self._VPAGE_URL, url) is None:
1880 request = urllib2.Request(url)
1881 try:
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1885 return
1886
1887 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: Unable to extract id field')
1890 return
1891 yahoo_id = mobj.group(1)
1892
1893 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1894 if mobj is None:
1895 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1896 return
1897 yahoo_vid = mobj.group(1)
1898
1899 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1900 return self._real_extract(url, new_video=False)
1901
1902 # Retrieve video webpage to extract further information
1903 request = urllib2.Request(url)
1904 try:
1905 self.report_download_webpage(video_id)
1906 webpage = urllib2.urlopen(request).read()
1907 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1908 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1909 return
1910
1911 # Extract uploader and title from webpage
1912 self.report_extraction(video_id)
1913 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1914 if mobj is None:
1915 self._downloader.trouble(u'ERROR: unable to extract video title')
1916 return
1917 video_title = mobj.group(1).decode('utf-8')
1918 simple_title = _simplify_title(video_title)
1919
1920 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1923 return
1924 video_uploader = mobj.group(1).decode('utf-8')
1925
1926 # Extract video thumbnail
1927 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1930 return
1931 video_thumbnail = mobj.group(1).decode('utf-8')
1932
1933 # Extract video description
1934 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1935 if mobj is None:
1936 self._downloader.trouble(u'ERROR: unable to extract video description')
1937 return
1938 video_description = mobj.group(1).decode('utf-8')
1939 if not video_description:
1940 video_description = 'No description available.'
1941
1942 # Extract video height and width
1943 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1944 if mobj is None:
1945 self._downloader.trouble(u'ERROR: unable to extract video height')
1946 return
1947 yv_video_height = mobj.group(1)
1948
1949 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1950 if mobj is None:
1951 self._downloader.trouble(u'ERROR: unable to extract video width')
1952 return
1953 yv_video_width = mobj.group(1)
1954
1955 # Retrieve video playlist to extract media URL
1956 # I'm not completely sure what all these options are, but we
1957 # seem to need most of them, otherwise the server sends a 401.
1958 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1959 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1960 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1961 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1962 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1963 try:
1964 self.report_download_webpage(video_id)
1965 webpage = urllib2.urlopen(request).read()
1966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1968 return
1969
1970 # Extract media URL from playlist XML
1971 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1972 if mobj is None:
1973 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1974 return
1975 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1976 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1977
1978 try:
1979 # Process video information
1980 self._downloader.process_info({
1981 'id': video_id.decode('utf-8'),
1982 'url': video_url,
1983 'uploader': video_uploader,
1984 'upload_date': u'NA',
1985 'title': video_title,
1986 'stitle': simple_title,
1987 'ext': video_extension.decode('utf-8'),
1988 'thumbnail': video_thumbnail.decode('utf-8'),
1989 'description': video_description,
1990 'thumbnail': video_thumbnail,
1991 'player_url': None,
1992 })
1993 except UnavailableVideoError:
1994 self._downloader.trouble(u'\nERROR: unable to download video')
1995
1996
1997 class VimeoIE(InfoExtractor):
1998 """Information extractor for vimeo.com."""
1999
2000 # _VALID_URL matches Vimeo URLs
2001 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2002 IE_NAME = u'vimeo'
2003
2004 def __init__(self, downloader=None):
2005 InfoExtractor.__init__(self, downloader)
2006
2007 def report_download_webpage(self, video_id):
2008 """Report webpage download."""
2009 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2010
2011 def report_extraction(self, video_id):
2012 """Report information extraction."""
2013 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2014
2015 def _real_extract(self, url, new_video=True):
2016 # Extract ID from URL
2017 mobj = re.match(self._VALID_URL, url)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2020 return
2021
2022 # At this point we have a new video
2023 self._downloader.increment_downloads()
2024 video_id = mobj.group(1)
2025
2026 # Retrieve video webpage to extract further information
2027 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2028 try:
2029 self.report_download_webpage(video_id)
2030 webpage = urllib2.urlopen(request).read()
2031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2033 return
2034
2035 # Now we begin extracting as much information as we can from what we
2036 # retrieved. First we extract the information common to all extractors,
2037 # and latter we extract those that are Vimeo specific.
2038 self.report_extraction(video_id)
2039
2040 # Extract title
2041 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2042 if mobj is None:
2043 self._downloader.trouble(u'ERROR: unable to extract video title')
2044 return
2045 video_title = mobj.group(1).decode('utf-8')
2046 simple_title = _simplify_title(video_title)
2047
2048 # Extract uploader
2049 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2050 if mobj is None:
2051 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2052 return
2053 video_uploader = mobj.group(1).decode('utf-8')
2054
2055 # Extract video thumbnail
2056 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2059 return
2060 video_thumbnail = mobj.group(1).decode('utf-8')
2061
2062 # # Extract video description
2063 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2064 # if mobj is None:
2065 # self._downloader.trouble(u'ERROR: unable to extract video description')
2066 # return
2067 # video_description = mobj.group(1).decode('utf-8')
2068 # if not video_description: video_description = 'No description available.'
2069 video_description = 'Foo.'
2070
2071 # Vimeo specific: extract request signature
2072 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2073 if mobj is None:
2074 self._downloader.trouble(u'ERROR: unable to extract request signature')
2075 return
2076 sig = mobj.group(1).decode('utf-8')
2077
2078 # Vimeo specific: extract video quality information
2079 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2080 if mobj is None:
2081 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2082 return
2083 quality = mobj.group(1).decode('utf-8')
2084
2085 if int(quality) == 1:
2086 quality = 'hd'
2087 else:
2088 quality = 'sd'
2089
2090 # Vimeo specific: Extract request signature expiration
2091 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2092 if mobj is None:
2093 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2094 return
2095 sig_exp = mobj.group(1).decode('utf-8')
2096
2097 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2098
2099 try:
2100 # Process video information
2101 self._downloader.process_info({
2102 'id': video_id.decode('utf-8'),
2103 'url': video_url,
2104 'uploader': video_uploader,
2105 'upload_date': u'NA',
2106 'title': video_title,
2107 'stitle': simple_title,
2108 'ext': u'mp4',
2109 'thumbnail': video_thumbnail.decode('utf-8'),
2110 'description': video_description,
2111 'thumbnail': video_thumbnail,
2112 'description': video_description,
2113 'player_url': None,
2114 })
2115 except UnavailableVideoError:
2116 self._downloader.trouble(u'ERROR: unable to download video')
2117
2118
2119 class GenericIE(InfoExtractor):
2120 """Generic last-resort information extractor."""
2121
2122 _VALID_URL = r'.*'
2123 IE_NAME = u'generic'
2124
2125 def __init__(self, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127
2128 def report_download_webpage(self, video_id):
2129 """Report webpage download."""
2130 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2131 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2132
2133 def report_extraction(self, video_id):
2134 """Report information extraction."""
2135 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2136
2137 def _real_extract(self, url):
2138 # At this point we have a new video
2139 self._downloader.increment_downloads()
2140
2141 video_id = url.split('/')[-1]
2142 request = urllib2.Request(url)
2143 try:
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2148 return
2149 except ValueError, err:
2150 # since this is the last-resort InfoExtractor, if
2151 # this error is thrown, it'll be thrown here
2152 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2153 return
2154
2155 self.report_extraction(video_id)
2156 # Start with something easy: JW Player in SWFObject
2157 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2158 if mobj is None:
2159 # Broaden the search a little bit
2160 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2161 if mobj is None:
2162 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2163 return
2164
2165 # It's possible that one of the regexes
2166 # matched, but returned an empty group:
2167 if mobj.group(1) is None:
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2169 return
2170
2171 video_url = urllib.unquote(mobj.group(1))
2172 video_id = os.path.basename(video_url)
2173
2174 # here's a fun little line of code for you:
2175 video_extension = os.path.splitext(video_id)[1][1:]
2176 video_id = os.path.splitext(video_id)[0]
2177
2178 # it's tempting to parse this further, but you would
2179 # have to take into account all the variations like
2180 # Video Title - Site Name
2181 # Site Name | Video Title
2182 # Video Title - Tagline | Site Name
2183 # and so on and so forth; it's just not practical
2184 mobj = re.search(r'<title>(.*)</title>', webpage)
2185 if mobj is None:
2186 self._downloader.trouble(u'ERROR: unable to extract title')
2187 return
2188 video_title = mobj.group(1).decode('utf-8')
2189 video_title = sanitize_title(video_title)
2190 simple_title = _simplify_title(video_title)
2191
2192 # video uploader is domain name
2193 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2194 if mobj is None:
2195 self._downloader.trouble(u'ERROR: unable to extract title')
2196 return
2197 video_uploader = mobj.group(1).decode('utf-8')
2198
2199 try:
2200 # Process video information
2201 self._downloader.process_info({
2202 'id': video_id.decode('utf-8'),
2203 'url': video_url.decode('utf-8'),
2204 'uploader': video_uploader,
2205 'upload_date': u'NA',
2206 'title': video_title,
2207 'stitle': simple_title,
2208 'ext': video_extension.decode('utf-8'),
2209 'format': u'NA',
2210 'player_url': None,
2211 })
2212 except UnavailableVideoError, err:
2213 self._downloader.trouble(u'\nERROR: unable to download video')
2214
2215
2216 class YoutubeSearchIE(InfoExtractor):
2217 """Information Extractor for YouTube search queries."""
2218 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2219 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2220 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2221 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2222 _youtube_ie = None
2223 _max_youtube_results = 1000
2224 IE_NAME = u'youtube:search'
2225
2226 def __init__(self, youtube_ie, downloader=None):
2227 InfoExtractor.__init__(self, downloader)
2228 self._youtube_ie = youtube_ie
2229
2230 def report_download_page(self, query, pagenum):
2231 """Report attempt to download playlist page with given number."""
2232 query = query.decode(preferredencoding())
2233 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2234
2235 def _real_initialize(self):
2236 self._youtube_ie.initialize()
2237
2238 def _real_extract(self, query):
2239 mobj = re.match(self._VALID_URL, query)
2240 if mobj is None:
2241 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2242 return
2243
2244 prefix, query = query.split(':')
2245 prefix = prefix[8:]
2246 query = query.encode('utf-8')
2247 if prefix == '':
2248 self._download_n_results(query, 1)
2249 return
2250 elif prefix == 'all':
2251 self._download_n_results(query, self._max_youtube_results)
2252 return
2253 else:
2254 try:
2255 n = long(prefix)
2256 if n <= 0:
2257 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2258 return
2259 elif n > self._max_youtube_results:
2260 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2261 n = self._max_youtube_results
2262 self._download_n_results(query, n)
2263 return
2264 except ValueError: # parsing prefix as integer fails
2265 self._download_n_results(query, 1)
2266 return
2267
2268 def _download_n_results(self, query, n):
2269 """Downloads a specified number of results for a query"""
2270
2271 video_ids = []
2272 already_seen = set()
2273 pagenum = 1
2274
2275 while True:
2276 self.report_download_page(query, pagenum)
2277 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2278 request = urllib2.Request(result_url)
2279 try:
2280 page = urllib2.urlopen(request).read()
2281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2282 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2283 return
2284
2285 # Extract video identifiers
2286 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2287 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2288 if video_id not in already_seen:
2289 video_ids.append(video_id)
2290 already_seen.add(video_id)
2291 if len(video_ids) == n:
2292 # Specified n videos reached
2293 for id in video_ids:
2294 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2295 return
2296
2297 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2298 for id in video_ids:
2299 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2300 return
2301
2302 pagenum = pagenum + 1
2303
2304
2305 class GoogleSearchIE(InfoExtractor):
2306 """Information Extractor for Google Video search queries."""
2307 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2308 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2309 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2310 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2311 _google_ie = None
2312 _max_google_results = 1000
2313 IE_NAME = u'video.google:search'
2314
2315 def __init__(self, google_ie, downloader=None):
2316 InfoExtractor.__init__(self, downloader)
2317 self._google_ie = google_ie
2318
2319 def report_download_page(self, query, pagenum):
2320 """Report attempt to download playlist page with given number."""
2321 query = query.decode(preferredencoding())
2322 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2323
2324 def _real_initialize(self):
2325 self._google_ie.initialize()
2326
2327 def _real_extract(self, query):
2328 mobj = re.match(self._VALID_URL, query)
2329 if mobj is None:
2330 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2331 return
2332
2333 prefix, query = query.split(':')
2334 prefix = prefix[8:]
2335 query = query.encode('utf-8')
2336 if prefix == '':
2337 self._download_n_results(query, 1)
2338 return
2339 elif prefix == 'all':
2340 self._download_n_results(query, self._max_google_results)
2341 return
2342 else:
2343 try:
2344 n = long(prefix)
2345 if n <= 0:
2346 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2347 return
2348 elif n > self._max_google_results:
2349 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2350 n = self._max_google_results
2351 self._download_n_results(query, n)
2352 return
2353 except ValueError: # parsing prefix as integer fails
2354 self._download_n_results(query, 1)
2355 return
2356
2357 def _download_n_results(self, query, n):
2358 """Downloads a specified number of results for a query"""
2359
2360 video_ids = []
2361 already_seen = set()
2362 pagenum = 1
2363
2364 while True:
2365 self.report_download_page(query, pagenum)
2366 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2367 request = urllib2.Request(result_url)
2368 try:
2369 page = urllib2.urlopen(request).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2372 return
2373
2374 # Extract video identifiers
2375 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2376 video_id = mobj.group(1)
2377 if video_id not in already_seen:
2378 video_ids.append(video_id)
2379 already_seen.add(video_id)
2380 if len(video_ids) == n:
2381 # Specified n videos reached
2382 for id in video_ids:
2383 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2384 return
2385
2386 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2387 for id in video_ids:
2388 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2389 return
2390
2391 pagenum = pagenum + 1
2392
2393
2394 class YahooSearchIE(InfoExtractor):
2395 """Information Extractor for Yahoo! Video search queries."""
2396 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2397 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2398 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2399 _MORE_PAGES_INDICATOR = r'\s*Next'
2400 _yahoo_ie = None
2401 _max_yahoo_results = 1000
2402 IE_NAME = u'video.yahoo:search'
2403
2404 def __init__(self, yahoo_ie, downloader=None):
2405 InfoExtractor.__init__(self, downloader)
2406 self._yahoo_ie = yahoo_ie
2407
2408 def report_download_page(self, query, pagenum):
2409 """Report attempt to download playlist page with given number."""
2410 query = query.decode(preferredencoding())
2411 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2412
2413 def _real_initialize(self):
2414 self._yahoo_ie.initialize()
2415
2416 def _real_extract(self, query):
2417 mobj = re.match(self._VALID_URL, query)
2418 if mobj is None:
2419 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2420 return
2421
2422 prefix, query = query.split(':')
2423 prefix = prefix[8:]
2424 query = query.encode('utf-8')
2425 if prefix == '':
2426 self._download_n_results(query, 1)
2427 return
2428 elif prefix == 'all':
2429 self._download_n_results(query, self._max_yahoo_results)
2430 return
2431 else:
2432 try:
2433 n = long(prefix)
2434 if n <= 0:
2435 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2436 return
2437 elif n > self._max_yahoo_results:
2438 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2439 n = self._max_yahoo_results
2440 self._download_n_results(query, n)
2441 return
2442 except ValueError: # parsing prefix as integer fails
2443 self._download_n_results(query, 1)
2444 return
2445
2446 def _download_n_results(self, query, n):
2447 """Downloads a specified number of results for a query"""
2448
2449 video_ids = []
2450 already_seen = set()
2451 pagenum = 1
2452
2453 while True:
2454 self.report_download_page(query, pagenum)
2455 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2456 request = urllib2.Request(result_url)
2457 try:
2458 page = urllib2.urlopen(request).read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2461 return
2462
2463 # Extract video identifiers
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 video_id = mobj.group(1)
2466 if video_id not in already_seen:
2467 video_ids.append(video_id)
2468 already_seen.add(video_id)
2469 if len(video_ids) == n:
2470 # Specified n videos reached
2471 for id in video_ids:
2472 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2473 return
2474
2475 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2476 for id in video_ids:
2477 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2478 return
2479
2480 pagenum = pagenum + 1
2481
2482
2483 class YoutubePlaylistIE(InfoExtractor):
2484 """Information Extractor for YouTube playlists."""
2485
2486 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2487 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2488 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2489 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2490 _youtube_ie = None
2491 IE_NAME = u'youtube:playlist'
2492
2493 def __init__(self, youtube_ie, downloader=None):
2494 InfoExtractor.__init__(self, downloader)
2495 self._youtube_ie = youtube_ie
2496
2497 def report_download_page(self, playlist_id, pagenum):
2498 """Report attempt to download playlist page with given number."""
2499 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2500
2501 def _real_initialize(self):
2502 self._youtube_ie.initialize()
2503
2504 def _real_extract(self, url):
2505 # Extract playlist id
2506 mobj = re.match(self._VALID_URL, url)
2507 if mobj is None:
2508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2509 return
2510
2511 # Single video case
2512 if mobj.group(3) is not None:
2513 self._youtube_ie.extract(mobj.group(3))
2514 return
2515
2516 # Download playlist pages
2517 # prefix is 'p' as default for playlists but there are other types that need extra care
2518 playlist_prefix = mobj.group(1)
2519 if playlist_prefix == 'a':
2520 playlist_access = 'artist'
2521 else:
2522 playlist_prefix = 'p'
2523 playlist_access = 'view_play_list'
2524 playlist_id = mobj.group(2)
2525 video_ids = []
2526 pagenum = 1
2527
2528 while True:
2529 self.report_download_page(playlist_id, pagenum)
2530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2531 request = urllib2.Request(url)
2532 try:
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536 return
2537
2538 # Extract video identifiers
2539 ids_in_page = []
2540 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2541 if mobj.group(1) not in ids_in_page:
2542 ids_in_page.append(mobj.group(1))
2543 video_ids.extend(ids_in_page)
2544
2545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2546 break
2547 pagenum = pagenum + 1
2548
2549 playliststart = self._downloader.params.get('playliststart', 1) - 1
2550 playlistend = self._downloader.params.get('playlistend', -1)
2551 video_ids = video_ids[playliststart:playlistend]
2552
2553 for id in video_ids:
2554 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2555 return
2556
2557
2558 class YoutubeUserIE(InfoExtractor):
2559 """Information Extractor for YouTube users."""
2560
2561 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2562 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2563 _GDATA_PAGE_SIZE = 50
2564 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2565 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2566 _youtube_ie = None
2567 IE_NAME = u'youtube:user'
2568
2569 def __init__(self, youtube_ie, downloader=None):
2570 InfoExtractor.__init__(self, downloader)
2571 self._youtube_ie = youtube_ie
2572
2573 def report_download_page(self, username, start_index):
2574 """Report attempt to download user page."""
2575 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2576 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2577
2578 def _real_initialize(self):
2579 self._youtube_ie.initialize()
2580
2581 def _real_extract(self, url):
2582 # Extract username
2583 mobj = re.match(self._VALID_URL, url)
2584 if mobj is None:
2585 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2586 return
2587
2588 username = mobj.group(1)
2589
2590 # Download video ids using YouTube Data API. Result size per
2591 # query is limited (currently to 50 videos) so we need to query
2592 # page by page until there are no video ids - it means we got
2593 # all of them.
2594
2595 video_ids = []
2596 pagenum = 0
2597
2598 while True:
2599 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2600 self.report_download_page(username, start_index)
2601
2602 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2603
2604 try:
2605 page = urllib2.urlopen(request).read()
2606 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2608 return
2609
2610 # Extract video identifiers
2611 ids_in_page = []
2612
2613 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2614 if mobj.group(1) not in ids_in_page:
2615 ids_in_page.append(mobj.group(1))
2616
2617 video_ids.extend(ids_in_page)
2618
2619 # A little optimization - if current page is not
2620 # "full", ie. does not contain PAGE_SIZE video ids then
2621 # we can assume that this page is the last one - there
2622 # are no more ids on further pages - no need to query
2623 # again.
2624
2625 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2626 break
2627
2628 pagenum += 1
2629
2630 all_ids_count = len(video_ids)
2631 playliststart = self._downloader.params.get('playliststart', 1) - 1
2632 playlistend = self._downloader.params.get('playlistend', -1)
2633
2634 if playlistend == -1:
2635 video_ids = video_ids[playliststart:]
2636 else:
2637 video_ids = video_ids[playliststart:playlistend]
2638
2639 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2640 (username, all_ids_count, len(video_ids)))
2641
2642 for video_id in video_ids:
2643 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2644
2645
2646 class DepositFilesIE(InfoExtractor):
2647 """Information extractor for depositfiles.com"""
2648
2649 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2650 IE_NAME = u'DepositFiles'
2651
2652 def __init__(self, downloader=None):
2653 InfoExtractor.__init__(self, downloader)
2654
2655 def report_download_webpage(self, file_id):
2656 """Report webpage download."""
2657 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2658
2659 def report_extraction(self, file_id):
2660 """Report information extraction."""
2661 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2662
2663 def _real_extract(self, url):
2664 # At this point we have a new file
2665 self._downloader.increment_downloads()
2666
2667 file_id = url.split('/')[-1]
2668 # Rebuild url in english locale
2669 url = 'http://depositfiles.com/en/files/' + file_id
2670
2671 # Retrieve file webpage with 'Free download' button pressed
2672 free_download_indication = { 'gateway_result' : '1' }
2673 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2674 try:
2675 self.report_download_webpage(file_id)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2678 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2679 return
2680
2681 # Search for the real file URL
2682 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2683 if (mobj is None) or (mobj.group(1) is None):
2684 # Try to figure out reason of the error.
2685 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2686 if (mobj is not None) and (mobj.group(1) is not None):
2687 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2688 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2689 else:
2690 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2691 return
2692
2693 file_url = mobj.group(1)
2694 file_extension = os.path.splitext(file_url)[1][1:]
2695
2696 # Search for file title
2697 mobj = re.search(r'<b title="(.*?)">', webpage)
2698 if mobj is None:
2699 self._downloader.trouble(u'ERROR: unable to extract title')
2700 return
2701 file_title = mobj.group(1).decode('utf-8')
2702
2703 try:
2704 # Process file information
2705 self._downloader.process_info({
2706 'id': file_id.decode('utf-8'),
2707 'url': file_url.decode('utf-8'),
2708 'uploader': u'NA',
2709 'upload_date': u'NA',
2710 'title': file_title,
2711 'stitle': file_title,
2712 'ext': file_extension.decode('utf-8'),
2713 'format': u'NA',
2714 'player_url': None,
2715 })
2716 except UnavailableVideoError, err:
2717 self._downloader.trouble(u'ERROR: unable to download file')
2718
2719
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from --username/--password or .netrc)
    before fetching the video page, then scrapes metadata and per-format
    stream URLs out of the page's inline JavaScript.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Known format identifiers, ordered best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Filename extension used for each format identifier.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that matched, plus 'video_urls' mapping each available
        format identifier to its stream URL.
        """
        # General data: one regex per metadata field; missing fields are
        # simply left out of the result dict.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped Unicode inside the utf-8 page.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: probe each known format identifier in turn.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials are available (best-effort)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Login is optional, so parsing problems are only warnings.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and download the requested format(s)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image (optional: a missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            # NOTE(review): _parse_page never sets 'upload_date', so this
            # branch appears to be dead code -- confirm before relying on it.
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz returns a 10-tuple; strftime needs 9 fields.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # NOTE(review): if no format URL matched, this silently downloads
        # nothing (no error is reported).
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # Restrict to formats at or below the quality cap, if one is set.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

            for format_param, video_real_url in video_url_list:

                # At this point we have a new video
                self._downloader.increment_downloads()

                # Extension
                video_extension = self._video_extensions.get(format_param, 'mp4')

                try:
                    # Process video information
                    self._downloader.process_info({
                        'id': video_id.decode('utf-8'),
                        'url': video_real_url.decode('utf-8'),
                        'uploader': video_uploader.decode('utf-8'),
                        'upload_date': upload_date,
                        'title': video_title,
                        'stitle': simple_title,
                        'ext': video_extension.decode('utf-8'),
                        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                        'thumbnail': video_thumbnail.decode('utf-8'),
                        'description': video_description.decode('utf-8'),
                        'player_url': None,
                    })
                except UnavailableVideoError, err:
                    self._downloader.trouble(u'\nERROR: unable to download video')
2935
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the page with skin=json; if the server answers with a video/*
    Content-Type instead, the URL itself is a direct media link and is
    downloaded as-is (keeping the already-open handle in 'urlhandle').
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct media download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL points straight at the media file; derive id/title
                # from the last path component and reuse the open handle.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': _simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is still open from above; read the JSON body now.
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': _simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                # Covers malformed JSON, missing fields and bad datestamps.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3028
3029
3030 class MyVideoIE(InfoExtractor):
3031 """Information Extractor for myvideo.de."""
3032
3033 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3034 IE_NAME = u'myvideo'
3035
3036 def __init__(self, downloader=None):
3037 InfoExtractor.__init__(self, downloader)
3038
3039 def report_download_webpage(self, video_id):
3040 """Report webpage download."""
3041 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3042
3043 def report_extraction(self, video_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3046
3047 def _real_extract(self,url):
3048 mobj = re.match(self._VALID_URL, url)
3049 if mobj is None:
3050 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3051 return
3052
3053 video_id = mobj.group(1)
3054
3055 # Get video webpage
3056 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3057 try:
3058 self.report_download_webpage(video_id)
3059 webpage = urllib2.urlopen(request).read()
3060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3062 return
3063
3064 self.report_extraction(video_id)
3065 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3066 webpage)
3067 if mobj is None:
3068 self._downloader.trouble(u'ERROR: unable to extract media URL')
3069 return
3070 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3071
3072 mobj = re.search('<title>([^<]+)</title>', webpage)
3073 if mobj is None:
3074 self._downloader.trouble(u'ERROR: unable to extract title')
3075 return
3076
3077 video_title = mobj.group(1)
3078 video_title = sanitize_title(video_title)
3079
3080 simple_title = _simplify_title(video_title)
3081
3082 try:
3083 self._downloader.process_info({
3084 'id': video_id,
3085 'url': video_url,
3086 'uploader': u'NA',
3087 'upload_date': u'NA',
3088 'title': video_title,
3089 'stitle': simple_title,
3090 'ext': u'flv',
3091 'format': u'NA',
3092 'player_url': None,
3093 })
3094 except UnavailableVideoError:
3095 self._downloader.trouble(u'\nERROR: Unable to download video')
3096
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts either a ":shortname" alias (e.g. ":tds", ":colbertreport") or a
    full-episodes URL; resolves the episode's MRSS index and downloads every
    video segment of the episode.
    """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a segment's media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map the ":shortname" aliases onto the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode part means "newest episode"; the site redirects
        # the bare full-episodes URL to it.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Re-parse the URL we were redirected to for the episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The embedded Flash player's <param> carries the mtvnservices URL
        # (group 0) and the media URI inside it (group 1).
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # One <item> per video segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like "...:<show>.com:<mediaId>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect every (bitrate, url) rendition offered for this segment.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            # NOTE(review): assumes renditions are listed in ascending bitrate
            # order (bitrate is a string here) -- confirm against the feed.
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': _simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3231
3232
3233 class EscapistIE(InfoExtractor):
3234 """Information extractor for The Escapist """
3235
3236 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3237 IE_NAME = u'escapist'
3238
3239 def report_extraction(self, showName):
3240 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3241
3242 def report_config_download(self, showName):
3243 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3244
3245 def _real_extract(self, url):
3246 htmlParser = HTMLParser.HTMLParser()
3247
3248 mobj = re.match(self._VALID_URL, url)
3249 if mobj is None:
3250 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251 return
3252 showName = mobj.group('showname')
3253 videoId = mobj.group('episode')
3254
3255 self.report_extraction(showName)
3256 try:
3257 webPage = urllib2.urlopen(url).read()
3258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3259 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3260 return
3261
3262 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3263 description = htmlParser.unescape(descMatch.group(1))
3264 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3265 imgUrl = htmlParser.unescape(imgMatch.group(1))
3266 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3267 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3268 configUrlMatch = re.search('config=(.*)$', playerUrl)
3269 configUrl = urllib2.unquote(configUrlMatch.group(1))
3270
3271 self.report_config_download(showName)
3272 try:
3273 configJSON = urllib2.urlopen(configUrl).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3276 return
3277
3278 # Technically, it's JavaScript, not JSON
3279 configJSON = configJSON.replace("'", '"')
3280
3281 try:
3282 config = json.loads(configJSON)
3283 except (ValueError,), err:
3284 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3285 return
3286
3287 playlist = config['playlist']
3288 videoUrl = playlist[1]['url']
3289
3290 self._downloader.increment_downloads()
3291 info = {
3292 'id': videoId,
3293 'url': videoUrl,
3294 'uploader': showName,
3295 'upload_date': None,
3296 'title': showName,
3297 'stitle': _simplify_title(showName),
3298 'ext': 'flv',
3299 'format': 'flv',
3300 'thumbnail': imgUrl,
3301 'description': description,
3302 'player_url': playerUrl,
3303 }
3304
3305 try:
3306 self._downloader.process_info(info)
3307 except UnavailableVideoError, err:
3308 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3309
3310
3311 class CollegeHumorIE(InfoExtractor):
3312 """Information extractor for collegehumor.com"""
3313
3314 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3315 IE_NAME = u'collegehumor'
3316
3317 def report_webpage(self, video_id):
3318 """Report information extraction."""
3319 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3320
3321 def report_extraction(self, video_id):
3322 """Report information extraction."""
3323 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3324
3325 def _real_extract(self, url):
3326 htmlParser = HTMLParser.HTMLParser()
3327
3328 mobj = re.match(self._VALID_URL, url)
3329 if mobj is None:
3330 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3331 return
3332 video_id = mobj.group('videoid')
3333
3334 self.report_webpage(video_id)
3335 request = urllib2.Request(url)
3336 try:
3337 webpage = urllib2.urlopen(request).read()
3338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3339 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3340 return
3341
3342 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3343 if m is None:
3344 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3345 return
3346 internal_video_id = m.group('internalvideoid')
3347
3348 info = {
3349 'id': video_id,
3350 'internal_id': internal_video_id,
3351 }
3352
3353 self.report_extraction(video_id)
3354 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3355 try:
3356 metaXml = urllib2.urlopen(xmlUrl).read()
3357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3358 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3359 return
3360
3361 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3362 try:
3363 videoNode = mdoc.findall('./video')[0]
3364 info['description'] = videoNode.findall('./description')[0].text
3365 info['title'] = videoNode.findall('./caption')[0].text
3366 info['stitle'] = _simplify_title(info['title'])
3367 info['url'] = videoNode.findall('./file')[0].text
3368 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3369 info['ext'] = info['url'].rpartition('.')[2]
3370 info['format'] = info['ext']
3371 except IndexError:
3372 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3373 return
3374
3375 self._downloader.increment_downloads()
3376
3377 try:
3378 self._downloader.process_info(info)
3379 except UnavailableVideoError, err:
3380 self._downloader.trouble(u'\nERROR: unable to download video')
3381
3382
3383 class XVideosIE(InfoExtractor):
3384 """Information extractor for xvideos.com"""
3385
3386 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3387 IE_NAME = u'xvideos'
3388
3389 def report_webpage(self, video_id):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3392
3393 def report_extraction(self, video_id):
3394 """Report information extraction."""
3395 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3396
3397 def _real_extract(self, url):
3398 htmlParser = HTMLParser.HTMLParser()
3399
3400 mobj = re.match(self._VALID_URL, url)
3401 if mobj is None:
3402 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3403 return
3404 video_id = mobj.group(1).decode('utf-8')
3405
3406 self.report_webpage(video_id)
3407
3408 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3409 try:
3410 webpage = urllib2.urlopen(request).read()
3411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3413 return
3414
3415 self.report_extraction(video_id)
3416
3417
3418 # Extract video URL
3419 mobj = re.search(r'flv_url=(.+?)&', webpage)
3420 if mobj is None:
3421 self._downloader.trouble(u'ERROR: unable to extract video url')
3422 return
3423 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3424
3425
3426 # Extract title
3427 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3428 if mobj is None:
3429 self._downloader.trouble(u'ERROR: unable to extract video title')
3430 return
3431 video_title = mobj.group(1).decode('utf-8')
3432
3433
3434 # Extract video thumbnail
3435 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3436 if mobj is None:
3437 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3438 return
3439 video_thumbnail = mobj.group(1).decode('utf-8')
3440
3441
3442
3443 self._downloader.increment_downloads()
3444 info = {
3445 'id': video_id,
3446 'url': video_url,
3447 'uploader': None,
3448 'upload_date': None,
3449 'title': video_title,
3450 'stitle': _simplify_title(video_title),
3451 'ext': 'flv',
3452 'format': 'flv',
3453 'thumbnail': video_thumbnail,
3454 'description': None,
3455 'player_url': None,
3456 }
3457
3458 try:
3459 self._downloader.process_info(info)
3460 except UnavailableVideoError, err:
3461 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3462
3463
3464 class SoundcloudIE(InfoExtractor):
3465 """Information extractor for soundcloud.com
3466 To access the media, the uid of the song and a stream token
3467 must be extracted from the page source and the script must make
3468 a request to media.soundcloud.com/crossdomain.xml. Then
3469 the media can be grabbed by requesting from an url composed
3470 of the stream token and uid
3471 """
3472
3473 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3474 IE_NAME = u'soundcloud'
3475
3476 def __init__(self, downloader=None):
3477 InfoExtractor.__init__(self, downloader)
3478
3479 def report_webpage(self, video_id):
3480 """Report information extraction."""
3481 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3482
3483 def report_extraction(self, video_id):
3484 """Report information extraction."""
3485 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3486
3487 def _real_extract(self, url):
3488 htmlParser = HTMLParser.HTMLParser()
3489
3490 mobj = re.match(self._VALID_URL, url)
3491 if mobj is None:
3492 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3493 return
3494
3495 # extract uploader (which is in the url)
3496 uploader = mobj.group(1).decode('utf-8')
3497 # extract simple title (uploader + slug of song title)
3498 slug_title = mobj.group(2).decode('utf-8')
3499 simple_title = uploader + '-' + slug_title
3500
3501 self.report_webpage('%s/%s' % (uploader, slug_title))
3502
3503 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3504 try:
3505 webpage = urllib2.urlopen(request).read()
3506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3507 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3508 return
3509
3510 self.report_extraction('%s/%s' % (uploader, slug_title))
3511
3512 # extract uid and stream token that soundcloud hands out for access
3513 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3514 if mobj:
3515 video_id = mobj.group(1)
3516 stream_token = mobj.group(2)
3517
3518 # extract unsimplified title
3519 mobj = re.search('"title":"(.*?)",', webpage)
3520 if mobj:
3521 title = mobj.group(1)
3522
3523 # construct media url (with uid/token)
3524 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3525 mediaURL = mediaURL % (video_id, stream_token)
3526
3527 # description
3528 description = u'No description available'
3529 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3530 if mobj:
3531 description = mobj.group(1)
3532
3533 # upload date
3534 upload_date = None
3535 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3536 if mobj:
3537 try:
3538 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3539 except Exception, e:
3540 print str(e)
3541
3542 # for soundcloud, a request to a cross domain is required for cookies
3543 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3544
3545 try:
3546 self._downloader.process_info({
3547 'id': video_id.decode('utf-8'),
3548 'url': mediaURL,
3549 'uploader': uploader.decode('utf-8'),
3550 'upload_date': upload_date,
3551 'title': simple_title.decode('utf-8'),
3552 'stitle': simple_title.decode('utf-8'),
3553 'ext': u'mp3',
3554 'format': u'NA',
3555 'player_url': None,
3556 'description': description.decode('utf-8')
3557 })
3558 except UnavailableVideoError:
3559 self._downloader.trouble(u'\nERROR: unable to download video')
3560
3561
3562 class InfoQIE(InfoExtractor):
3563 """Information extractor for infoq.com"""
3564
3565 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3566 IE_NAME = u'infoq'
3567
3568 def report_webpage(self, video_id):
3569 """Report information extraction."""
3570 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3571
3572 def report_extraction(self, video_id):
3573 """Report information extraction."""
3574 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3575
3576 def _real_extract(self, url):
3577 htmlParser = HTMLParser.HTMLParser()
3578
3579 mobj = re.match(self._VALID_URL, url)
3580 if mobj is None:
3581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3582 return
3583
3584 self.report_webpage(url)
3585
3586 request = urllib2.Request(url)
3587 try:
3588 webpage = urllib2.urlopen(request).read()
3589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3590 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3591 return
3592
3593 self.report_extraction(url)
3594
3595
3596 # Extract video URL
3597 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3598 if mobj is None:
3599 self._downloader.trouble(u'ERROR: unable to extract video url')
3600 return
3601 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3602
3603
3604 # Extract title
3605 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3606 if mobj is None:
3607 self._downloader.trouble(u'ERROR: unable to extract video title')
3608 return
3609 video_title = mobj.group(1).decode('utf-8')
3610
3611 # Extract description
3612 video_description = u'No description available.'
3613 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3614 if mobj is not None:
3615 video_description = mobj.group(1).decode('utf-8')
3616
3617 video_filename = video_url.split('/')[-1]
3618 video_id, extension = video_filename.split('.')
3619
3620 self._downloader.increment_downloads()
3621 info = {
3622 'id': video_id,
3623 'url': video_url,
3624 'uploader': None,
3625 'upload_date': None,
3626 'title': video_title,
3627 'stitle': _simplify_title(video_title),
3628 'ext': extension,
3629 'format': extension, # Extension is always(?) mp4, but seems to be flv
3630 'thumbnail': None,
3631 'description': video_description,
3632 'player_url': None,
3633 }
3634
3635 try:
3636 self._downloader.process_info(info)
3637 except UnavailableVideoError, err:
3638 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3639
3640 class MixcloudIE(InfoExtractor):
3641 """Information extractor for www.mixcloud.com"""
3642 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3643 IE_NAME = u'mixcloud'
3644
3645 def __init__(self, downloader=None):
3646 InfoExtractor.__init__(self, downloader)
3647
3648 def report_download_json(self, file_id):
3649 """Report JSON download."""
3650 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3651
3652 def report_extraction(self, file_id):
3653 """Report information extraction."""
3654 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3655
3656 def get_urls(self, jsonData, fmt, bitrate='best'):
3657 """Get urls from 'audio_formats' section in json"""
3658 file_url = None
3659 try:
3660 bitrate_list = jsonData[fmt]
3661 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3662 bitrate = max(bitrate_list) # select highest
3663
3664 url_list = jsonData[fmt][bitrate]
3665 except TypeError: # we have no bitrate info.
3666 url_list = jsonData[fmt]
3667
3668 return url_list
3669
3670 def check_urls(self, url_list):
3671 """Returns 1st active url from list"""
3672 for url in url_list:
3673 try:
3674 urllib2.urlopen(url)
3675 return url
3676 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3677 url = None
3678
3679 return None
3680
3681 def _print_formats(self, formats):
3682 print 'Available formats:'
3683 for fmt in formats.keys():
3684 for b in formats[fmt]:
3685 try:
3686 ext = formats[fmt][b][0]
3687 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3688 except TypeError: # we have no bitrate info
3689 ext = formats[fmt][0]
3690 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3691 break
3692
3693 def _real_extract(self, url):
3694 mobj = re.match(self._VALID_URL, url)
3695 if mobj is None:
3696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3697 return
3698 # extract uploader & filename from url
3699 uploader = mobj.group(1).decode('utf-8')
3700 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3701
3702 # construct API request
3703 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3704 # retrieve .json file with links to files
3705 request = urllib2.Request(file_url)
3706 try:
3707 self.report_download_json(file_url)
3708 jsonData = urllib2.urlopen(request).read()
3709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3710 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3711 return
3712
3713 # parse JSON
3714 json_data = json.loads(jsonData)
3715 player_url = json_data['player_swf_url']
3716 formats = dict(json_data['audio_formats'])
3717
3718 req_format = self._downloader.params.get('format', None)
3719 bitrate = None
3720
3721 if self._downloader.params.get('listformats', None):
3722 self._print_formats(formats)
3723 return
3724
3725 if req_format is None or req_format == 'best':
3726 for format_param in formats.keys():
3727 url_list = self.get_urls(formats, format_param)
3728 # check urls
3729 file_url = self.check_urls(url_list)
3730 if file_url is not None:
3731 break # got it!
3732 else:
3733 if req_format not in formats.keys():
3734 self._downloader.trouble(u'ERROR: format is not available')
3735 return
3736
3737 url_list = self.get_urls(formats, req_format)
3738 file_url = self.check_urls(url_list)
3739 format_param = req_format
3740
3741 # We have audio
3742 self._downloader.increment_downloads()
3743 try:
3744 # Process file information
3745 self._downloader.process_info({
3746 'id': file_id.decode('utf-8'),
3747 'url': file_url.decode('utf-8'),
3748 'uploader': uploader.decode('utf-8'),
3749 'upload_date': u'NA',
3750 'title': json_data['name'],
3751 'stitle': _simplify_title(json_data['name']),
3752 'ext': file_url.split('.')[-1].decode('utf-8'),
3753 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3754 'thumbnail': json_data['thumbnail_url'],
3755 'description': json_data['description'],
3756 'player_url': player_url.decode('utf-8'),
3757 })
3758 except UnavailableVideoError, err:
3759 self._downloader.trouble(u'ERROR: unable to download file')
3760
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: the site root, a course page
    # (course=... only), and a single video page (course=...&video=...).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or the
        whole-site root playlist.  Playlist branches recurse by calling
        self.extract() on every referenced page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': _simplify_title(course + '_' + video),
            }

            self.report_extraction(info['id'])
            # Each video has a sibling .xml metadata file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['stitle'] = _simplify_title(info['title'])
            # Derive extension/format from the media URL's suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            info = {
                'id': _simplify_title(course),
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            # Course title; fall back to the simplified id if absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every linked video page (deduplicated, order kept)
            # and recurse into each one.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        else: # Root page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
                return

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Every course page found on the home page becomes a playlist
            # entry, each of which is extracted recursively.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
3879
3880
class PostProcessor(object):
    """Base class for download post-processing steps.

    A PostProcessor registers itself with a downloader through the
    downloader's add_post_processor() method.  After a successful download
    the downloader walks its chain of PostProcessors, feeding the first one
    the download's info dictionary and each subsequent one the value the
    previous run() returned.

    The walk stops as soon as a run() returns None, or when the chain is
    exhausted.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this PP belongs to."""
        self._downloader = downloader

    def run(self, information):
        """Execute this post-processing step.

        information is an InfoExtractor-style dictionary extended with a
        "filepath" key naming the downloaded file.

        Returning None halts the chain; returning a dictionary (possibly
        with modified fields) passes it on to the next PostProcessor.
        A PostProcessingError may also be raised for the downloader that
        invoked this step to handle.
        """
        # The base implementation is a no-op pass-through.
        return information
3926
3927
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded video
    into an audio-only file, using ffprobe to detect the source codec and
    ffmpeg to copy or transcode it."""

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec      # 'best', 'aac', 'mp3', 'vorbis' or 'm4a'
        self._preferredquality = preferredquality  # ffmpeg -ab bitrate string, or None
        self._keepvideo = keepvideo                # keep the source video file after extraction

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name of the audio stream in path via ffprobe,
        or None if it cannot be determined (ffprobe missing or failing)."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        # ffprobe prints codec_name before codec_type within each stream
        # block, so remember the last codec_name seen and report it when the
        # audio stream's codec_type line appears.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write out_path from path with the given audio codec
        and extra options; return True on success."""
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract audio from information['filepath'] and update the dict to
        point at the new audio file.  Returns None (stopping the PP chain)
        on any failure."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(new_path, (time.time(), information['filetime']))
            # BUG FIX: was a bare except, which also swallowed
            # KeyboardInterrupt and SystemExit; utime only raises OS errors.
            except EnvironmentError:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(path)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4034
4035
4036 def updateSelf(downloader, filename):
4037 ''' Update the program file with the latest version from the repository '''
4038 # Note: downloader only used for options
4039 if not os.access(filename, os.W_OK):
4040 sys.exit('ERROR: no write permissions on %s' % filename)
4041
4042 downloader.to_screen('Updating to latest version...')
4043
4044 try:
4045 try:
4046 urlh = urllib.urlopen(UPDATE_URL)
4047 newcontent = urlh.read()
4048
4049 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4050 if vmatch is not None and vmatch.group(1) == __version__:
4051 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4052 return
4053 finally:
4054 urlh.close()
4055 except (IOError, OSError), err:
4056 sys.exit('ERROR: unable to download latest version')
4057
4058 try:
4059 outf = open(filename, 'wb')
4060 try:
4061 outf.write(newcontent)
4062 finally:
4063 outf.close()
4064 except (IOError, OSError), err:
4065 sys.exit('ERROR: unable to overwrite current version')
4066
4067 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4068
def parseOpts():
    """Build the optparse parser, merge config files with sys.argv and parse.

    Option sources are concatenated in increasing priority:
    /etc/youtube-dl.conf, then the per-user config file, then the actual
    command line.  Returns a (parser, opts, args) triple.
    """
    # Deferred imports
    import getpass
    import optparse
    import shlex

    def _readOptions(filename):
        # Read one config file; each line is split shell-style, with
        # '#' comments honoured.
        try:
            optionf = open(filename)
        except IOError:
            return [] # silently skip if file is not present
        try:
            res = []
            for l in optionf:
                res += shlex.split(l, comments=True)
        finally:
            optionf.close()
        return res

    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --format METAVAR'''

        opts = []

        if option._short_opts: opts.append(option._short_opts[0])
        if option._long_opts: opts.append(option._long_opts[0])
        if len(opts) > 1: opts.insert(1, ', ')

        if option.takes_value(): opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        # Best-effort terminal width: $COLUMNS first, then `stty size`;
        # None when neither works (e.g. no controlling terminal).
        columns = os.environ.get('COLUMNS', None)
        if columns:
            return int(columns)

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
        except:
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version' : __version__,
        'formatter' : fmt,
        'usage' : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)


    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)


    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')


    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Per-user config lives in $XDG_CONFIG_HOME/youtube-dl.conf, falling
    # back to ~/.config/youtube-dl.conf.
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    if xdg_config_home:
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    else:
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
4278
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # A few extractors are shared between their "plain" and wrapper
    # (playlist/user/search) variants, so build them once up front.
    yt = YoutubeIE()
    goog = GoogleIE()
    yah = YahooIE()

    extractors = [
        YoutubePlaylistIE(yt),
        YoutubeUserIE(yt),
        YoutubeSearchIE(yt),
        yt,
        MetacafeIE(yt),
        DailymotionIE(),
        goog,
        GoogleSearchIE(goog),
        PhotobucketIE(),
        yah,
        YahooSearchIE(yah),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
    ]
    # The catch-all extractor must stay last.
    extractors.append(GenericIE())
    return extractors
4314
4315 def _real_main():
4316 parser, opts, args = parseOpts()
4317
4318 # Open appropriate CookieJar
4319 if opts.cookiefile is None:
4320 jar = cookielib.CookieJar()
4321 else:
4322 try:
4323 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4324 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4325 jar.load()
4326 except (IOError, OSError), err:
4327 sys.exit(u'ERROR: unable to open cookie file')
4328
4329 # Dump user agent
4330 if opts.dump_user_agent:
4331 print std_headers['User-Agent']
4332 sys.exit(0)
4333
4334 # Batch file verification
4335 batchurls = []
4336 if opts.batchfile is not None:
4337 try:
4338 if opts.batchfile == '-':
4339 batchfd = sys.stdin
4340 else:
4341 batchfd = open(opts.batchfile, 'r')
4342 batchurls = batchfd.readlines()
4343 batchurls = [x.strip() for x in batchurls]
4344 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4345 except IOError:
4346 sys.exit(u'ERROR: batch file could not be read')
4347 all_urls = batchurls + args
4348
4349 # General configuration
4350 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4351 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4352 urllib2.install_opener(opener)
4353 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4354
4355 extractors = gen_extractors()
4356
4357 if opts.list_extractors:
4358 for ie in extractors:
4359 print(ie.IE_NAME)
4360 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4361 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4362 for mu in matchedUrls:
4363 print(u' ' + mu)
4364 sys.exit(0)
4365
4366 # Conflicting, missing and erroneous options
4367 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4368 parser.error(u'using .netrc conflicts with giving username/password')
4369 if opts.password is not None and opts.username is None:
4370 parser.error(u'account username missing')
4371 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4372 parser.error(u'using output template conflicts with using title, literal title or auto number')
4373 if opts.usetitle and opts.useliteral:
4374 parser.error(u'using title conflicts with using literal title')
4375 if opts.username is not None and opts.password is None:
4376 opts.password = getpass.getpass(u'Type account password and press return:')
4377 if opts.ratelimit is not None:
4378 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4379 if numeric_limit is None:
4380 parser.error(u'invalid rate limit specified')
4381 opts.ratelimit = numeric_limit
4382 if opts.retries is not None:
4383 try:
4384 opts.retries = long(opts.retries)
4385 except (TypeError, ValueError), err:
4386 parser.error(u'invalid retry count specified')
4387 try:
4388 opts.playliststart = int(opts.playliststart)
4389 if opts.playliststart <= 0:
4390 raise ValueError(u'Playlist start must be positive')
4391 except (TypeError, ValueError), err:
4392 parser.error(u'invalid playlist start number specified')
4393 try:
4394 opts.playlistend = int(opts.playlistend)
4395 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4396 raise ValueError(u'Playlist end must be greater than playlist start')
4397 except (TypeError, ValueError), err:
4398 parser.error(u'invalid playlist end number specified')
4399 if opts.extractaudio:
4400 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4401 parser.error(u'invalid audio format specified')
4402
4403 # File downloader
4404 fd = FileDownloader({
4405 'usenetrc': opts.usenetrc,
4406 'username': opts.username,
4407 'password': opts.password,
4408 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4409 'forceurl': opts.geturl,
4410 'forcetitle': opts.gettitle,
4411 'forcethumbnail': opts.getthumbnail,
4412 'forcedescription': opts.getdescription,
4413 'forcefilename': opts.getfilename,
4414 'forceformat': opts.getformat,
4415 'simulate': opts.simulate,
4416 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4417 'format': opts.format,
4418 'format_limit': opts.format_limit,
4419 'listformats': opts.listformats,
4420 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4421 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4422 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4423 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4424 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4425 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4426 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4427 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4428 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4429 or u'%(id)s.%(ext)s'),
4430 'ignoreerrors': opts.ignoreerrors,
4431 'ratelimit': opts.ratelimit,
4432 'nooverwrites': opts.nooverwrites,
4433 'retries': opts.retries,
4434 'continuedl': opts.continue_dl,
4435 'noprogress': opts.noprogress,
4436 'playliststart': opts.playliststart,
4437 'playlistend': opts.playlistend,
4438 'logtostderr': opts.outtmpl == '-',
4439 'consoletitle': opts.consoletitle,
4440 'nopart': opts.nopart,
4441 'updatetime': opts.updatetime,
4442 'writedescription': opts.writedescription,
4443 'writeinfojson': opts.writeinfojson,
4444 'matchtitle': opts.matchtitle,
4445 'rejecttitle': opts.rejecttitle,
4446 'max_downloads': opts.max_downloads,
4447 'prefer_free_formats': opts.prefer_free_formats,
4448 })
4449 for extractor in extractors:
4450 fd.add_info_extractor(extractor)
4451
4452 # PostProcessors
4453 if opts.extractaudio:
4454 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4455
4456 # Update version
4457 if opts.update_self:
4458 updateSelf(fd, sys.argv[0])
4459
4460 # Maybe do nothing
4461 if len(all_urls) < 1:
4462 if not opts.update_self:
4463 parser.error(u'you must provide at least one URL')
4464 else:
4465 sys.exit()
4466
4467 try:
4468 retcode = fd.download(all_urls)
4469 except MaxDownloadsReached:
4470 fd.to_screen(u'--max-download limit reached, aborting.')
4471 retcode = 101
4472
4473 # Dump cookie jar if requested
4474 if opts.cookiefile is not None:
4475 try:
4476 jar.save()
4477 except (IOError, OSError), err:
4478 sys.exit(u'ERROR: unable to save cookie jar')
4479
4480 sys.exit(retcode)
4481
def main():
	"""Public entry point: run _real_main() and turn the expected
	failure modes into process exit statuses."""
	try:
		_real_main()
	except DownloadError:
		# A download failed; exit with a non-zero status code.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# The leading newline keeps the message clear of any progress output.
		sys.exit(u'\nERROR: Interrupted by user')
4491
# Run the command-line interface only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
	main()
4494
4495 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: